;
; Copyright (C) 2009-2010 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;

; A rather unoptimised set of sse yuv to rgb converters
; does 4 pixels per loop

; input:
; reads 8 bit yuv data for 4 pixels and puts
; the y values converted to 16 bit in mm0
; the u values converted to 16 bit and duplicated into mm1
; the v values converted to 16 bit and duplicated into mm2

; conversion:
; does the yuv to rgb conversion using 16 bit fixed point; the
; results are placed into the following registers as 16 bit signed values
; (they are clamped to 8 bit by the output stage)
; r values in mm3
; g values in mm4
; b values in mm5

; output:
; writes out the rgba pixels as 8 bit components with 0 for alpha

; mm6 used for scratch
; mm7 used for scratch
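
; Note: the shift-and-add sequences in yuv2rgbsse below appear to approximate
; the usual full-range YCbCr to RGB equations (coefficients rounded):
;   r = y + 1.402 * (v - 128)
;   g = y - 0.344 * (u - 128) - 0.714 * (v - 128)
;   b = y + 1.772 * (u - 128)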

%macro  cglobal 1
        global  _%1
        %define %1 _%1
        align 16
%1:
%endmacro
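
; cglobal declares the symbol with a leading underscore (the C symbol naming
; convention on some platforms/object formats) and 16 byte aligns the entry point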

; conversion code
%macro yuv2rgbsse 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6
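;
; the shift sums above evaluate to (see the reference equations at the top):
;   r: 1 + 1/4 + 1/8 + 1/32      = 1.40625  (~1.402)  for v
;   g: 1/4 + 1/16 + 1/32         = 0.34375  (~0.344)  for u
;      1/2 + 1/8 + 1/16 + 1/32   = 0.71875  (~0.714)  for v
;   b: 1 + 1/2 + 1/4 + 1/64      = 1.765625 (~1.772)  for u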
; subtract 16 from y (note: the psubsw below is commented out, so y is used as-is)
        movq mm7, [Const16]             ; loads a constant using data cache (slower on first fetch but then cached)
;       psubsw mm0,mm7                  ; y = y - 16
; subtract 128 from u and v
        movq mm7, [Const128]            ; loads a constant using data cache (slower on first fetch but then cached)
        psubsw mm1,mm7                  ; u = u - 128
        psubsw mm2,mm7                  ; v = v - 128
; load r,g,b with y
        movq mm3,mm0                    ; r = y
        pshufw mm5,mm0, 0xE4            ; b = y

; r = r + v + v >> 2 + v >> 3 + v >> 5
        paddsw mm3, mm2                 ; add v to r
        movq mm7, mm1                   ; move u to scratch
        pshufw mm6, mm2, 0xE4           ; move v to scratch

        psraw  mm6,2                    ; mm6 = v >> 2
        paddsw mm3, mm6                 ; and add to r
        psraw  mm6,1                    ; mm6 = v >> 3
        paddsw mm3, mm6                 ; and add to r
        psraw  mm6,2                    ; mm6 = v >> 5
        paddsw mm3, mm6                 ; and add to r

; b = y + u + u >> 1 + u >> 2 + u >> 6
        paddsw mm5, mm1                 ; add u to b
        psraw  mm7,1                    ; mm7 = u >> 1
        paddsw mm5, mm7                 ; and add to b
        psraw  mm7,1                    ; mm7 = u >> 2
        paddsw mm5, mm7                 ; and add to b
        psraw  mm7,4                    ; mm7 = u >> 6
        paddsw mm5, mm7                 ; and add to b

; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
        movq mm7,mm2                    ; move v to scratch
        pshufw mm6,mm1, 0xE4            ; move u to scratch
        movq mm4,mm0                    ; g = y

        psraw  mm6,2                    ; mm6 = u >> 2
        psubsw mm4,mm6                  ; subtract from g
        psraw  mm6,2                    ; mm6 = u >> 4
        psubsw mm4,mm6                  ; subtract from g
        psraw  mm6,1                    ; mm6 = u >> 5
        psubsw mm4,mm6                  ; subtract from g

        psraw  mm7,1                    ; mm7 = v >> 1
        psubsw mm4,mm7                  ; subtract from g
        psraw  mm7,2                    ; mm7 = v >> 3
        psubsw mm4,mm7                  ; subtract from g
        psraw  mm7,1                    ; mm7 = v >> 4
        psubsw mm4,mm7                  ; subtract from g
        psraw  mm7,1                    ; mm7 = v >> 5
        psubsw mm4,mm7                  ; subtract from g
%endmacro
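
; yuv2rgbsse leaves 16 bit signed results in mm3/mm4/mm5; they can be negative
; or above 255, so the output macro below clamps them with packuswb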

; output
%macro rgba32sseoutput 0
; clamp values
        pxor mm7,mm7
        packuswb mm3,mm7                ; clamp to 0,255 and pack R to 8 bit per pixel
        packuswb mm4,mm7                ; clamp to 0,255 and pack G to 8 bit per pixel
        packuswb mm5,mm7                ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
        punpcklbw mm5,mm4               ; bgbgbgbg
        movq mm0, mm5                   ; save bg values
        punpcklbw mm3,mm7               ; r0r0r0r0
        punpcklwd mm5,mm3               ; lower half bgr0bgr0
        punpckhwd mm0,mm3               ; upper half bgr0bgr0
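; each output pixel is written to memory as the byte sequence b,g,r,0
; (bgra byte order with the alpha byte left at 0)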
; write to output ptr
        movq [edi], mm5                 ; output first 2 pixels
        movq [edi+8], mm0               ; output second 2 pixels
%endmacro

SECTION .data align=16

Const16         dw      16, 16, 16, 16, 16, 16, 16, 16

Const128        dw      128, 128, 128, 128, 128, 128, 128, 128
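
; each constant is declared as eight 16 bit words (16 bytes) although the
; 64 bit movq loads above only read the first four; the extra words are
; presumably padding / room for a 128 bit variant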

; Packed Convert
; void Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width)
%define width   ebp+16
%define toPtr   ebp+12
%define fromPtr ebp+8
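
; with cdecl on x86, after "push ebp / mov ebp, esp" the return address is at
; [ebp+4] and the first argument at [ebp+8], which is where these offsets come from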

; Planar Convert
; void Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
%define width1   ebp+24
%define toPtr1   ebp+20
%define fromVPtr ebp+16
%define fromUPtr ebp+12
%define fromYPtr ebp+8
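
; note: both converters process width / 4 groups of 4 pixels; any remaining
; 1-3 pixels at the end of a row are left unconverted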

SECTION .text align=16

; YUY2 FOURCC
cglobal Convert_YUV422_RGBA32_SSE
; set up stack frame and save registers
        push ebp
        mov ebp, esp
        push edi
        push esi
        push ecx

        mov esi, [fromPtr]
        mov ecx, [width]
        mov edi, [toPtr]
; loop width / 4 times
        shr ecx,2
        test ecx,ecx
        jng ENDLOOP2
REPEATLOOP2:                            ; loop over width / 4

; YUV422 packed input
        movq mm0, [esi]                 ; should have yuyv yuyv
        pshufw mm1, mm0, 0xE4           ; copy to mm1
        movq mm2, mm0                   ; copy to mm2
; extract y
        pxor mm7,mm7                    ; 0000000000000000
        pcmpeqb mm6,mm6                 ; ffffffffffffffff
        punpckhbw mm6,mm7               ; interleave mm7 into mm6 ff00ff00ff00ff00
        pand mm0, mm6                   ; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
        psrld mm6,8                     ; 00ff0000 00ff0000
        pand mm1, mm6                   ; clear all yv values leaving 0u00 etc
        psrld mm1,8                     ; shift u down to get u000
        pshufw mm1,mm1, 0xA0            ; copy u values to get u0u0 (pshufw is SSE, not plain MMX)
; extract v
        pslld mm6,16                    ; 000000ff000000ff
        pand mm2, mm6                   ; clear all yu values leaving 000v etc
        psrld mm2,8                     ; shift v down to get 00v0
        pshufw mm2,mm2, 0xF5            ; copy v values to get v0v0 (pshufw is SSE, not plain MMX)
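
; at this point mm0 = y0 y1 y2 y3, mm1 = u0 u0 u1 u1, mm2 = v0 v0 v1 v1
; (each value is a 16 bit word)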

yuv2rgbsse

rgba32sseoutput

; endloop
        add edi,16                      ; 4 output pixels * 4 bytes
        add esi,8                       ; 4 input pixels * 2 bytes (yuyv)
        sub ecx, 1                      ; apparently sub is better than dec
        jnz REPEATLOOP2
ENDLOOP2:
; Cleanup
        emms                            ; empty MMX state so the FPU registers can be used again
        pop ecx
        pop esi
        pop edi
        mov esp, ebp
        pop ebp
        ret

cglobal Convert_YUV420P_RGBA32_SSE
; set up stack frame and save registers
        push ebp
        mov ebp, esp
        push edi
        push esi
        push ecx
        push eax
        push ebx

        mov esi, [fromYPtr]
        mov eax, [fromUPtr]
        mov ebx, [fromVPtr]
        mov edi, [toPtr1]
        mov ecx, [width1]
; loop width / 4 times
        shr ecx,2
        test ecx,ecx
        jng ENDLOOP3
REPEATLOOP3:                            ; loop over width / 4
; YUV420 planar input
        movq mm0, [esi]                 ; fetch 8 y values (8 bit); only the low 4 are used
        movd mm1, [eax]                 ; fetch 4 u values (8 bit); only the low 2 are used
        movd mm2, [ebx]                 ; fetch 4 v values (8 bit); only the low 2 are used

; extract y
        pxor mm7,mm7                    ; 0000000000000000
        punpcklbw mm0,mm7               ; interleave mm7 into mm0 giving y0y0y0y0
; extract u and duplicate so each becomes 0u0u
        punpcklbw mm1,mm7               ; interleave mm7 into mm1 giving u0u0u0u0
        punpcklwd mm1,mm7               ; interleave again giving u000u000
        pshufw mm1,mm1, 0xA0            ; copy u values to get u0u0
; extract v
        punpcklbw mm2,mm7               ; interleave mm7 into mm2 giving v0v0v0v0
        punpcklwd mm2,mm7               ; interleave again giving v000v000
        pshufw mm2,mm2, 0xA0            ; copy v values to get v0v0
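
; as in the packed path, mm0/mm1/mm2 now hold y0 y1 y2 y3, u0 u0 u1 u1 and
; v0 v0 v1 v1 as 16 bit words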

yuv2rgbsse

rgba32sseoutput

; endloop
        add edi,16                      ; 4 output pixels * 4 bytes
        add esi,4                       ; 4 y values
        add eax,2                       ; 2 u values (u and v are subsampled 2:1 horizontally)
        add ebx,2                       ; 2 v values
        sub ecx, 1                      ; apparently sub is better than dec
        jnz REPEATLOOP3
ENDLOOP3:
; Cleanup
        emms                            ; empty MMX state so the FPU registers can be used again
        pop ebx
        pop eax
        pop ecx
        pop esi
        pop edi
        mov esp, ebp
        pop ebp
        ret

SECTION .note.GNU-stack noalloc noexec nowrite progbits