;; NOTE(review): the lines that originally appeared here were web-viewer
;; navigation chrome (Home/History/Annotate/Raw/Download and a mirror path)
;; captured during extraction; they are not part of the source file.
;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*                    Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
22
23%include "x86inc.asm"
24%include "x86util.asm"
25
26SECTION_RODATA
27
28minshort:      times 8 dw 0x8000
29yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000
30yuv2yuvX_10_start:  times 4 dd 0x10000
31yuv2yuvX_9_start:   times 4 dd 0x20000
32yuv2yuvX_10_upper:  times 8 dw 0x3ff
33yuv2yuvX_9_upper:   times 8 dw 0x1ff
34pd_4:          times 4 dd 4
35pd_4min0x40000:times 4 dd 4 - (0x40000)
36pw_16:         times 8 dw 16
37pw_32:         times 8 dw 32
38pw_512:        times 8 dw 512
39pw_1024:       times 8 dw 1024
40
41SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------

59%macro yuv2planeX_fn 3
60
61%ifdef ARCH_X86_32
62%define cntr_reg r1
63%define movsx mov
64%else
65%define cntr_reg r11
66%define movsx movsxd
67%endif
68
69cglobal yuv2planeX_%1, %3, 7, %2
70%if %1 == 8 || %1 == 9 || %1 == 10
71    pxor            m6,  m6
72%endif ; %1 == 8/9/10
73
74%if %1 == 8
75%ifdef ARCH_X86_32
76%assign pad 0x2c - (stack_offset & 15)
77    SUB             rsp, pad
78%define m_dith m7
79%else ; x86-64
80%define m_dith m9
81%endif ; x86-32
82
83    ; create registers holding dither
84    movq        m_dith, [r5]             ; dither
85    test            r6d, r6d
86    jz              .no_rot
87%if mmsize == 16
88    punpcklqdq  m_dith,  m_dith
89%endif ; mmsize == 16
90    PALIGNR     m_dith,  m_dith,  3,  m0
91.no_rot:
92%if mmsize == 16
93    punpcklbw   m_dith,  m6
94%ifdef ARCH_X86_64
95    punpcklwd       m8,  m_dith,  m6
96    pslld           m8,  12
97%else ; x86-32
98    punpcklwd       m5,  m_dith,  m6
99    pslld           m5,  12
100%endif ; x86-32/64
101    punpckhwd   m_dith,  m6
102    pslld       m_dith,  12
103%ifdef ARCH_X86_32
104    mova      [rsp+ 0],  m5
105    mova      [rsp+16],  m_dith
106%endif
107%else ; mmsize == 8
108    punpcklbw       m5,  m_dith,  m6
109    punpckhbw   m_dith,  m6
110    punpcklwd       m4,  m5,  m6
111    punpckhwd       m5,  m6
112    punpcklwd       m3,  m_dith,  m6
113    punpckhwd   m_dith,  m6
114    pslld           m4,  12
115    pslld           m5,  12
116    pslld           m3,  12
117    pslld       m_dith,  12
118    mova      [rsp+ 0],  m4
119    mova      [rsp+ 8],  m5
120    mova      [rsp+16],  m3
121    mova      [rsp+24],  m_dith
122%endif ; mmsize == 8/16
123%endif ; %1 == 8
124
125    xor             r5,  r5
126
127.pixelloop:
128%assign %%i 0
129    ; the rep here is for the 8bit output mmx case, where dither covers
130    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
131    ; pixels per iteration. In order to not have to keep track of where
132    ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
133%if %1 == 8
134%rep 16/mmsize
135%endif ; %1 == 8
136
137%if %1 == 8
138%ifdef ARCH_X86_32
139    mova            m2, [rsp+mmsize*(0+%%i)]
140    mova            m1, [rsp+mmsize*(1+%%i)]
141%else ; x86-64
142    mova            m2,  m8
143    mova            m1,  m_dith
144%endif ; x86-32/64
145%else ; %1 == 9/10/16
146    mova            m1, [yuv2yuvX_%1_start]
147    mova            m2,  m1
148%endif ; %1 == 8/9/10/16
149    movsx     cntr_reg,  r1m
150.filterloop_ %+ %%i:
151    ; input pixels
152    mov             r6, [r2+gprsize*cntr_reg-2*gprsize]
153%if %1 == 16
154    mova            m3, [r6+r5*4]
155    mova            m5, [r6+r5*4+mmsize]
156%else ; %1 == 8/9/10
157    mova            m3, [r6+r5*2]
158%endif ; %1 == 8/9/10/16
159    mov             r6, [r2+gprsize*cntr_reg-gprsize]
160%if %1 == 16
161    mova            m4, [r6+r5*4]
162    mova            m6, [r6+r5*4+mmsize]
163%else ; %1 == 8/9/10
164    mova            m4, [r6+r5*2]
165%endif ; %1 == 8/9/10/16
166
167    ; coefficients
168    movd            m0, [r0+2*cntr_reg-4]; coeff[0], coeff[1]
169%if %1 == 16
170    pshuflw         m7,  m0,  0          ; coeff[0]
171    pshuflw         m0,  m0,  0x55       ; coeff[1]
172    pmovsxwd        m7,  m7              ; word -> dword
173    pmovsxwd        m0,  m0              ; word -> dword
174
175    pmulld          m3,  m7
176    pmulld          m5,  m7
177    pmulld          m4,  m0
178    pmulld          m6,  m0
179
180    paddd           m2,  m3
181    paddd           m1,  m5
182    paddd           m2,  m4
183    paddd           m1,  m6
184%else ; %1 == 10/9/8
185    punpcklwd       m5,  m3,  m4
186    punpckhwd       m3,  m4
187    SPLATD          m0,  m0
188
189    pmaddwd         m5,  m0
190    pmaddwd         m3,  m0
191
192    paddd           m2,  m5
193    paddd           m1,  m3
194%endif ; %1 == 8/9/10/16
195
196    sub       cntr_reg,  2
197    jg .filterloop_ %+ %%i
198
199%if %1 == 16
200    psrad           m2,  31 - %1
201    psrad           m1,  31 - %1
202%else ; %1 == 10/9/8
203    psrad           m2,  27 - %1
204    psrad           m1,  27 - %1
205%endif ; %1 == 8/9/10/16
206
207%if %1 == 8
208    packssdw        m2,  m1
209    packuswb        m2,  m2
210    movh     [r3+r5*1],  m2
211%else ; %1 == 9/10/16
212%if %1 == 16
213    packssdw        m2,  m1
214    paddw           m2, [minshort]
215%else ; %1 == 9/10
216%if cpuflag(sse4)
217    packusdw        m2,  m1
218%else ; mmx2/sse2
219    packssdw        m2,  m1
220    pmaxsw          m2,  m6
221%endif ; mmx2/sse2/sse4/avx
222    pminsw          m2, [yuv2yuvX_%1_upper]
223%endif ; %1 == 9/10/16
224    mova     [r3+r5*2],  m2
225%endif ; %1 == 8/9/10/16
226
227    add             r5,  mmsize/2
228    sub             r4d, mmsize/2
229%if %1 == 8
230%assign %%i %%i+2
231%endrep
232%endif ; %1 == 8
233    jg .pixelloop
234
235%if %1 == 8
236%ifdef ARCH_X86_32
237    ADD             rsp, pad
238    RET
239%else ; x86-64
240    REP_RET
241%endif ; x86-32/64
242%else ; %1 == 9/10/16
243    REP_RET
244%endif ; %1 == 8/9/10/16
245%endmacro
246
247%define PALIGNR PALIGNR_MMX
248%ifdef ARCH_X86_32
249INIT_MMX mmx2
250yuv2planeX_fn  8,  0, 7
251yuv2planeX_fn  9,  0, 5
252yuv2planeX_fn 10,  0, 5
253%endif
254
255INIT_XMM sse2
256yuv2planeX_fn  8, 10, 7
257yuv2planeX_fn  9,  7, 5
258yuv2planeX_fn 10,  7, 5
259
260%define PALIGNR PALIGNR_SSSE3
261INIT_XMM sse4
262yuv2planeX_fn  8, 10, 7
263yuv2planeX_fn  9,  7, 5
264yuv2planeX_fn 10,  7, 5
265yuv2planeX_fn 16,  8, 5
266
267INIT_XMM avx
268yuv2planeX_fn  8, 10, 7
269yuv2planeX_fn  9,  7, 5
270yuv2planeX_fn 10,  7, 5
271
272; %1=outout-bpc, %2=alignment (u/a)
273%macro yuv2plane1_mainloop 2
274.loop_%2:
275%if %1 == 8
276    paddsw          m0, m2, [r0+r2*2+mmsize*0]
277    paddsw          m1, m3, [r0+r2*2+mmsize*1]
278    psraw           m0, 7
279    psraw           m1, 7
280    packuswb        m0, m1
281    mov%2      [r1+r2], m0
282%elif %1 == 16
283    paddd           m0, m4, [r0+r2*4+mmsize*0]
284    paddd           m1, m4, [r0+r2*4+mmsize*1]
285    paddd           m2, m4, [r0+r2*4+mmsize*2]
286    paddd           m3, m4, [r0+r2*4+mmsize*3]
287    psrad           m0, 3
288    psrad           m1, 3
289    psrad           m2, 3
290    psrad           m3, 3
291%if cpuflag(sse4) ; avx/sse4
292    packusdw        m0, m1
293    packusdw        m2, m3
294%else ; mmx/sse2
295    packssdw        m0, m1
296    packssdw        m2, m3
297    paddw           m0, m5
298    paddw           m2, m5
299%endif ; mmx/sse2/sse4/avx
300    mov%2    [r1+r2*2], m0
301    mov%2    [r1+r2*2+mmsize], m2
302%else
303    paddsw          m0, m2, [r0+r2*2+mmsize*0]
304    paddsw          m1, m2, [r0+r2*2+mmsize*1]
305    psraw           m0, 15 - %1
306    psraw           m1, 15 - %1
307    pmaxsw          m0, m4
308    pmaxsw          m1, m4
309    pminsw          m0, m3
310    pminsw          m1, m3
311    mov%2    [r1+r2*2], m0
312    mov%2    [r1+r2*2+mmsize], m1
313%endif
314    add             r2, mmsize
315    jl .loop_%2
316%endmacro
317
318%macro yuv2plane1_fn 3
319cglobal yuv2plane1_%1, %3, %3, %2
320    add             r2, mmsize - 1
321    and             r2, ~(mmsize - 1)
322%if %1 == 8
323    add             r1, r2
324%else ; %1 != 8
325    lea             r1, [r1+r2*2]
326%endif ; %1 == 8
327%if %1 == 16
328    lea             r0, [r0+r2*4]
329%else ; %1 != 16
330    lea             r0, [r0+r2*2]
331%endif ; %1 == 16
332    neg             r2
333
334%if %1 == 8
335    pxor            m4, m4               ; zero
336
337    ; create registers holding dither
338    movq            m3, [r3]             ; dither
339    test           r4d, r4d
340    jz              .no_rot
341%if mmsize == 16
342    punpcklqdq      m3, m3
343%endif ; mmsize == 16
344    PALIGNR_MMX     m3, m3, 3, m2
345.no_rot:
346%if mmsize == 8
347    mova            m2, m3
348    punpckhbw       m3, m4               ; byte->word
349    punpcklbw       m2, m4               ; byte->word
350%else
351    punpcklbw       m3, m4
352    mova            m2, m3
353%endif
354%elif %1 == 9
355    pxor            m4, m4
356    mova            m3, [pw_512]
357    mova            m2, [pw_32]
358%elif %1 == 10
359    pxor            m4, m4
360    mova            m3, [pw_1024]
361    mova            m2, [pw_16]
362%else ; %1 == 16
363%if cpuflag(sse4) ; sse4/avx
364    mova            m4, [pd_4]
365%else ; mmx/sse2
366    mova            m4, [pd_4min0x40000]
367    mova            m5, [minshort]
368%endif ; mmx/sse2/sse4/avx
369%endif ; %1 == ..
370
371    ; actual pixel scaling
372%if mmsize == 8
373    yuv2plane1_mainloop %1, a
374%else ; mmsize == 16
375    test            r1, 15
376    jnz .unaligned
377    yuv2plane1_mainloop %1, a
378    REP_RET
379.unaligned:
380    yuv2plane1_mainloop %1, u
381%endif ; mmsize == 8/16
382    REP_RET
383%endmacro
384
385%ifdef ARCH_X86_32
386INIT_MMX mmx
387yuv2plane1_fn  8, 0, 5
388yuv2plane1_fn 16, 0, 3
389
390INIT_MMX mmx2
391yuv2plane1_fn  9, 0, 3
392yuv2plane1_fn 10, 0, 3
393%endif
394
395INIT_XMM sse2
396yuv2plane1_fn  8, 5, 5
397yuv2plane1_fn  9, 5, 3
398yuv2plane1_fn 10, 5, 3
399yuv2plane1_fn 16, 6, 3
400
401INIT_XMM sse4
402yuv2plane1_fn 16, 5, 3
403
404INIT_XMM avx
405yuv2plane1_fn  8, 5, 5
406yuv2plane1_fn  9, 5, 3
407yuv2plane1_fn 10, 5, 3
408yuv2plane1_fn 16, 5, 3
409