;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

max_19bit_int: times 4 dd 0x7ffff      ; upper clip value for 19-bit output (integer form, mmx/sse4)
max_19bit_flt: times 4 dd 524287.0     ; same clip value as float, for the sse2/ssse3 minps path
minshort:      times 8 dw 0x8000       ; bias mapping unsigned 16-bit input into signed range for pmaddwd
unicoeff:      times 4 dd 0x20000000   ; 0x8000 * 0x4000, added back after the horizontal add

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                               (SwsContext *c, int{16,32}_t *dst,
;                                int dstW, const uint{8,16}_t *src,
;                                const int16_t *filter,
;                                const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bit or 16-bit wide
; ($source_width can be 8, 9, 10 or 16; the difference is whether we have to
; downscale before multiplying). The filter is 14-bit. Output is either 15-bit
; (in int16_t) or 19-bit (in int32_t), as given by $intermediate_nbits. Each
; output pixel is generated from $filterSize input pixels; the position of
; the first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------

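; As a rough reference, each variant below computes the scalar loop sketched
; here (illustrative only: the SwsContext argument is dropped, the helper name
; is hypothetical, and only the upper clip is shown, for the 8-bit source /
; 15-bit output case where the shift is 14 + 8 - 15 = 7):
;
;   static void hscale8to15_ref(int16_t *dst, int dstW, const uint8_t *src,
;                               const int16_t *filter, const int32_t *filterPos,
;                               int filterSize)
;   {
;       for (int i = 0; i < dstW; i++) {
;           int val = 0;
;           for (int j = 0; j < filterSize; j++)
;               val += src[filterPos[i] + j] * filter[i * filterSize + j];
;           /* 14-bit filter, 8-bit source: shift down to 15 bits and clip */
;           dst[i] = (val >> 7) > 0x7fff ? 0x7fff : (val >> 7);
;       }
;   }
;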
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, opt, n_args, n_xmm
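; (for example, the SCALE_FUNCS/SCALE_FUNCS2 macros at the bottom of this file
;  instantiate "SCALE_FUNC 8, 15, 4, 4, sse2, 6, 6", which emits
;  hscale8to15_4_sse2: 4-tap filter, 8-bit source, 15-bit intermediate, SSE2)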
%macro SCALE_FUNC 7
cglobal hscale%1to%2_%4_%5, %6, 7, %7
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
    mova          m2, [max_19bit_int]
%elifidn %5, sse4
    mova          m2, [max_19bit_int]
%else ; ssse3/sse2
    mova          m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
    mova          m6, [minshort]
    mova          m7, [unicoeff]
%elif %1 == 8
    pxor          m3, m3
%endif ; %1 == 8/16

%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16

%ifnidn %3, X

    ; setup loop
%if %3 == 8
    shl           r2, 1                  ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define r2shr 1
%else ; %3 == 4
%define r2shr 0
%endif ; %3 == 8
    lea           r4, [r4+r2*8]
%if %2 == 15
    lea           r1, [r1+r2*(2>>r2shr)]
%else ; %2 == 19
    lea           r1, [r1+r2*(4>>r2shr)]
%endif ; %2 == 15/19
    lea           r5, [r5+r2*(4>>r2shr)]
    neg           r2

.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    mov32         r0, dword [r5+r2*4+0]  ; filterPos[0]
    mov32         r6, dword [r5+r2*4+4]  ; filterPos[1]
    movlh         m0, [r3+r0*srcmul]     ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh         m1, [r3+r6*srcmul]     ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps        m0, [r3+r6*srcmul]     ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd          m4, [r3+r6*srcmul]     ; src[filterPos[1] + {0,1,2,3}]
%endif
    mov32         r0, dword [r5+r2*4+8]  ; filterPos[2]
    mov32         r6, dword [r5+r2*4+12] ; filterPos[3]
    movlh         m1, [r3+r0*srcmul]     ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps        m1, [r3+r6*srcmul]     ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd          m5, [r3+r6*srcmul]     ; src[filterPos[3] + {0,1,2,3}]
    punpckldq     m0, m4
    punpckldq     m1, m5
%endif ; %1 == 8 && %5 <= ssse3
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                 ; byte -> word
    punpcklbw     m1, m3                 ; byte -> word
%endif ; %1 == 8

    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
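             ; (sum((x - 0x8000) * c) + 0x8000 * sum(c) == sum(x * c); with the
             ;  14-bit filter taps nominally summing to 0x4000, the restore term
             ;  is the 0x8000 * 0x4000 == 0x20000000 "unicoeff" constant)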
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}]

    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
    movq          m4, m0
    punpckldq     m0, m1
    punpckhdq     m4, m1
    paddd         m0, m4
%elifidn %5, sse2
    mova          m4, m0
    shufps        m0, m1, 10001000b
    shufps        m4, m1, 11011101b
    paddd         m0, m4
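                                         ; the shufps pair regroups the two
                                         ; partial dword sums of each output
                                         ; pixel, so this paddd leaves one
                                         ; accumulated dword per output pixel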
%else ; ssse3/sse4
    phaddd        m0, m1                 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                         ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                         ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                         ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    mov32         r0, dword [r5+r2*2+0]  ; filterPos[0]
    mov32         r6, dword [r5+r2*2+4]  ; filterPos[1]
    movbh         m0, [r3+ r0   *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh         m1, [r3+(r0+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
    movbh         m4, [r3+ r6   *srcmul] ; src[filterPos[1] + {0,1,2,3}]
    movbh         m5, [r3+(r6+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh         m1, [r3+ r6   *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    mov32         r0, dword [r5+r2*2+8]  ; filterPos[2]
    mov32         r6, dword [r5+r2*2+12] ; filterPos[3]
    movbh         m4, [r3+ r0   *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh         m5, [r3+ r6   *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                 ; byte -> word
    punpcklbw     m1, m3                 ; byte -> word
    punpcklbw     m4, m3                 ; byte -> word
    punpcklbw     m5, m3                 ; byte -> word
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
    psubw         m4, m6
    psubw         m5, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4+r2*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [r4+r2*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
    pmaddwd       m4, [r4+r2*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
    pmaddwd       m5, [r4+r2*8+mmsize*3] ; *= filter[{24,25,..,30,31}]

    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
    paddd         m0, m1
    paddd         m4, m5
    movq          m1, m0
    punpckldq     m0, m4
    punpckhdq     m1, m4
    paddd         m0, m1
%elifidn %5, sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
    ; emulate horizontal add as transpose + vertical add
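    ; (m0/m1/m4/m5 each hold four partial dword sums of one output pixel; the
    ;  unpack/add ladder below transposes them so m0 ends up with one total per
    ;  output pixel, the same result the phaddd path produces)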
    mova         mex, m0
    punpckldq     m0, m1
    punpckhdq    mex, m1
    paddd         m0, mex
    mova          m1, m4
    punpckldq     m4, m5
    punpckhdq     m1, m5
    paddd         m4, m1
    mova          m1, m0
    punpcklqdq    m0, m4
    punpckhqdq    m1, m4
    paddd         m0, m1
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd        m0, m1
    phaddd        m4, m5
    phaddd        m0, m4                 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                         ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                         ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                         ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8

%else ; %3 == X, i.e. any filterSize scaling

%ifidn %4, X4
%define r6sub 4
%else ; %4 == X || %4 == X8
%define r6sub 0
%endif ; %4 ==/!= X4
%ifdef ARCH_X86_64
    push         r12
    movsxd        r6, r6d                ; filterSize
    lea          r12, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4]
%define src_reg r11
%define r1x     r10
%define filter2 r12
%else ; x86-32
    lea           r0, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4]
    mov          r6m, r0
%define src_reg r3
%define r1x     r1
%define filter2 r6m
%endif ; x86-32/64
    lea           r5, [r5+r2*4]
%if %2 == 15
    lea           r1, [r1+r2*2]
%else ; %2 == 19
    lea           r1, [r1+r2*4]
%endif ; %2 == 15/19
    movifnidn   r1mp, r1
    neg           r2

.loop:
    mov32         r0, dword [r5+r2*4+0]  ; filterPos[0]
    mov32        r1x, dword [r5+r2*4+4]  ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor          m4, m4
    pxor          m5, m5
    mov      src_reg, r3mp

.innerloop:
    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh         m0, [src_reg+r0 *srcmul]    ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh         m1, [src_reg+(r1x+r6sub)*srcmul]    ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw     m0, m3
    punpcklbw     m1, m3
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4     ]          ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd       m1, [r4+(r6+r6sub)*2]          ; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
    paddd         m4, m0
    paddd         m5, m1
    add           r4, mmsize
    add      src_reg, srcmul*mmsize/2
    cmp      src_reg, filter2            ; while (src += 4) < &src[filterSize]
    jl .innerloop

%ifidn %4, X4
    mov32        r1x, dword [r5+r2*4+4]  ; filterPos[1]
    movlh         m0, [src_reg+r0 *srcmul] ; split last 4 srcpx of dstpx[0]
    sub          r1x, r6                   ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps        m0, [src_reg+(r1x+r6sub)*srcmul]
%else ; %1 == 8
    movd          m1, [src_reg+(r1x+r6sub)*srcmul]
    punpckldq     m0, m1
%endif ; %1 == 8 && %5 <= ssse3
%if %1 == 8
    punpcklbw     m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
%endif ; %1 == 16
    pmaddwd       m0, [r4]
%endif ; %4 == X4

    lea           r4, [r4+(r6+r6sub)*2]

%if mmsize == 8 ; mmx
    movq          m0, m4
    punpckldq     m4, m5
    punpckhdq     m0, m5
    paddd         m0, m4
%else ; mmsize == 16
%ifidn %5, sse2
    mova          m1, m4
    punpcklqdq    m4, m5
    punpckhqdq    m1, m5
    paddd         m4, m1
%else ; ssse3/sse4
    phaddd        m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd         m4, m0
%endif ; %4 == X4
%ifidn %5, sse2
    pshufd        m4, m4, 11011000b
    movhlps       m0, m4
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m4, m4
    SWAP           0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X

%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd         m0, m7
%endif ; %1 == 16

    ; clip, store
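    ; (the accumulated products carry $source_width + 14 bits, so shifting by
    ;  14 + %1 - %2 leaves the requested $intermediate_nbits-bit result)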
    psrad         m0, 14 + %1 - %2
%ifidn %3, X
    movifnidn     r1, r1mp
%endif ; %3 == X
%if %2 == 15
    packssdw      m0, m0
%ifnidn %3, X
    movh [r1+r2*(2>>r2shr)], m0
%else ; %3 == X
    movd   [r1+r2*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
%if mmsize == 8
    PMINSD_MMX    m0, m2, m4
%elifidn %5, sse4
    pminsd        m0, m2
%else ; sse2/ssse3
    cvtdq2ps      m0, m0
    minps         m0, m2
    cvtps2dq      m0, m0
%endif ; mmx/sse2/ssse3/sse4
%ifnidn %3, X
    mova [r1+r2*(4>>r2shr)], m0
%else ; %3 == X
    movq   [r1+r2*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add           r2, (mmsize<<r2shr)/4  ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
                                         ; per iteration; see "shl r2, 1" above for why we do this
%else ; %3 == X
    add           r2, 2
%endif ; %3 ==/!= X
    jl .loop
%ifnidn %3, X
    REP_RET
%else ; %3 == X
%ifdef ARCH_X86_64
    pop          r12
    RET
%else ; x86-32
    REP_RET
%endif ; x86-32/64
%endif ; %3 ==/!= X
%endmacro

; SCALE_FUNCS source_width, intermediate_nbits, opt, n_xmm
%macro SCALE_FUNCS 4
SCALE_FUNC %1, %2, 4, 4,  %3, 6, %4
SCALE_FUNC %1, %2, 8, 8,  %3, 6, %4
%if mmsize == 8
SCALE_FUNC %1, %2, X, X,  %3, 7, %4
%else
SCALE_FUNC %1, %2, X, X4, %3, 7, %4
SCALE_FUNC %1, %2, X, X8, %3, 7, %4
%endif
%endmacro

; SCALE_FUNCS2 opt, 8_xmm_args, 9to10_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 4
%ifnidn %1, sse4
SCALE_FUNCS  8, 15, %1, %2
SCALE_FUNCS  9, 15, %1, %3
SCALE_FUNCS 10, 15, %1, %3
SCALE_FUNCS 16, 15, %1, %4
%endif ; !sse4
SCALE_FUNCS  8, 19, %1, %2
SCALE_FUNCS  9, 19, %1, %3
SCALE_FUNCS 10, 19, %1, %3
SCALE_FUNCS 16, 19, %1, %4
%endmacro

%ifdef ARCH_X86_32
INIT_MMX
SCALE_FUNCS2 mmx,   0, 0, 0
%endif
INIT_XMM
SCALE_FUNCS2 sse2,  6, 7, 8
SCALE_FUNCS2 ssse3, 6, 6, 8
SCALE_FUNCS2 sse4,  6, 6, 8