1;
2; Copyright (C) 2009-2010 David McPaul
3;
4; All rights reserved. Distributed under the terms of the MIT License.
5;
6
7; A rather unoptimised set of sse2 yuv to rgb converters
8; does 8 pixels per loop
9
; inputer:
; reads 8 pixels of 8 bit yuv data and puts
; the y values converted to 16 bit in xmm0
; the u and v values converted to 16 bit and interleaved
; as u v u v u v u v in xmm3 (each u,v pair is shared by two pixels)
15
16; conversion:
17; does the yuv to rgb conversion using 16 bit fixed point and the
18; results are placed into the following registers as 8 bit clamped values
19; r values in xmm3
20; g values in xmm4
21; b values in xmm5
22
; outputer:
; writes out 8 pixels of 32 bits each, one 8 bit value per channel,
; stored in memory as b, g, r, 0 (alpha is always written as 0)
25
26; xmm6 used for scratch
27; xmm7 used for scratch
28
; cglobal <name>
; Exports the symbol _<name>, makes <name> an alias for _<name> for the rest
; of the file, and opens the function's entry label on a 16-byte boundary.
%macro cglobal 1
	%define %1 _%1
	global _%1
	align 16
%1:									; expands to the underscore-prefixed label
%endmacro
35
SECTION .data align=16

; Constant tables for the converters below. Every table is exactly 16 bytes
; and the ones used by the code are read as 128-bit memory operands, so each
; label has to sit on a 16-byte boundary. The explicit aligns emit no padding
; while every table stays 16 bytes long.

; 8 words of 16; only referenced by the disabled "y = y - 16" step.
	align 16, db 0
Const16:
	times 8 dw 16

; 8 words of 128, subtracted from the u and v words to centre them on zero.
	align 16, db 0
Const128:
	times 8 dw 128

; pmaddwd coefficient pairs (u coefficient, v coefficient), scaled by 4096
; to match the "psrad 12" that follows each multiply.
; r offset =  0.000 * u + 1.402 * v
	align 16, db 0
RConst:
	times 4 dw 0, 5743

; g offset = -0.344 * u - 0.714 * v
	align 16, db 0
GConst:
	times 4 dw -1409, -2925

; b offset =  1.772 * u + 0.000 * v
	align 16, db 0
BConst:
	times 4 dw 7258, 0

; pshufb control: copies the low word of each dword into both of its words.
	align 16, db 0
shuffconst:
	db 0x00, 0x01, 0x00, 0x01
	db 0x04, 0x05, 0x04, 0x05
	db 0x08, 0x09, 0x08, 0x09
	db 0x0c, 0x0d, 0x0c, 0x0d

; Even source byte indices, each followed by 0x80.
; NOTE(review): not referenced in the visible code; looks like a pshufb
; control that widens the even (y) bytes to words - confirm before relying on it.
	align 16, db 0
YMask:
	db 0x00, 0x80, 0x02, 0x80
	db 0x04, 0x80, 0x06, 0x80
	db 0x08, 0x80, 0x0a, 0x80
	db 0x0c, 0x80, 0x0e, 0x80

; Odd source byte indices, each followed by 0x80.
; NOTE(review): not referenced in the visible code; looks like a pshufb
; control that widens the odd (u/v) bytes to words - confirm before relying on it.
	align 16, db 0
UVMask:
	db 0x01, 0x80, 0x03, 0x80
	db 0x05, 0x80, 0x07, 0x80
	db 0x09, 0x80, 0x0b, 0x80
	db 0x0d, 0x80, 0x0f, 0x80
133
134; conversion code 
; yuv2rgbsse2
;   in:       xmm0 = 8 y values as 16 bit words
;             xmm3 = 4 (u, v) pairs as 16 bit words: u0 v0 u1 v1 u2 v2 u3 v3
;   out:      xmm3 = r, xmm4 = g, xmm5 = b, 8 signed 16 bit words each
;             (not clamped yet; the outputer does the clamping)
;   unchanged: xmm0
;   memory:   Const128, RConst, GConst and BConst are read as 128-bit
;             operands and therefore must be 16-byte aligned
;
; Formulas, with u and v centred on zero first:
;   r = y              + 1.402 * v
;   g = y - 0.344 * u  - 0.714 * v
;   b = y + 1.772 * u
; The coefficients are stored multiplied by 4096, hence the shift by 12.
%macro yuv2rgbsse2 0
; y is used as is; the "y = y - 16" step is disabled:
;	psubsw xmm0, [Const16]

; centre the chroma and make one copy per colour channel
	psubsw xmm3, [Const128]			; u = u - 128, v = v - 128
	movdqa xmm4, xmm3				; copy for green
	movdqa xmm5, xmm3				; copy for blue

; red: one dword per (u, v) pair, then spread to both pixels of the pair
	pmaddwd xmm3, [RConst]			; u * 0 + v * 5743
	psrad xmm3, 12					; back to pixel range
	pshuflw xmm3, xmm3, 0xa0		; words 0 0 2 2 of the low half
	pshufhw xmm3, xmm3, 0xa0		; words 0 0 2 2 of the high half
	paddsw xmm3, xmm0				; r = y + red offset

; green
	pmaddwd xmm4, [GConst]			; u * -1409 + v * -2925
	psrad xmm4, 12
	pshuflw xmm4, xmm4, 0xa0
	pshufhw xmm4, xmm4, 0xa0
	paddsw xmm4, xmm0				; g = y + green offset

; blue
	pmaddwd xmm5, [BConst]			; u * 7258 + v * 0
	psrad xmm5, 12
	pshuflw xmm5, xmm5, 0xa0
	pshufhw xmm5, xmm5, 0xa0
	paddsw xmm5, xmm0				; b = y + blue offset
%endmacro
168
169; outputer
; rgba32sse2output
;   in:       xmm3 = r, xmm4 = g, xmm5 = b, 8 signed 16 bit words each
;             edi  = destination, must be 16-byte aligned (movntdq)
;   out:      32 bytes at [edi]: 8 pixels stored as b, g, r, 0
;   clobbers: xmm0, xmm3, xmm4, xmm5, xmm7
;   edi is not advanced here; the caller steps it by 32.
%macro rgba32sse2output 0
; saturate every channel to 0..255, one byte per pixel in the low 8 bytes
	pxor xmm7, xmm7					; all zero, also supplies the alpha bytes
	packuswb xmm5, xmm7				; b
	packuswb xmm4, xmm7				; g
	packuswb xmm3, xmm7				; r
; build the 32 bit pixels
	punpcklbw xmm3, xmm7			; r0r0r0r0r0r0r0r0
	punpcklbw xmm5, xmm4			; bgbgbgbgbgbgbgbg
	movdqa xmm0, xmm5				; keep a copy for the upper four pixels
	punpcklwd xmm5, xmm3			; pixels 0-3: bgr0bgr0bgr0bgr0
	punpckhwd xmm0, xmm3			; pixels 4-7: bgr0bgr0bgr0bgr0
; non-temporal stores, so the output does not displace cached data
	movntdq [edi], xmm5
	movntdq [edi+16], xmm0
%endmacro
186
; Stack argument offsets. They are valid once a function has executed
; "push ebp / mov ebp, esp": [ebp] then holds the saved ebp, [ebp+4] the
; return address and [ebp+8] the first argument. Each name is meant to be
; used inside brackets, e.g. "mov esi, [fromPtr]".

; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
%define fromPtr ebp+8
%define toPtr   ebp+12
%define width   ebp+16

; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
%define fromYPtr ebp+8
%define fromUPtr ebp+12
%define fromVPtr ebp+16
%define toPtr1   ebp+20
%define width1   ebp+24
198
199SECTION .text align=16
200
;-----------------------------------------------------------------------
; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
;
; Converts a run of packed YUV422 pixels (bytes y u y v y u y v ...) into
; 32 bit pixels stored as b, g, r, 0.
;
; In (stack): fromPtr - source; read with movdqa, must be 16-byte aligned
;             toPtr   - destination; written with movntdq, must be
;                       16-byte aligned
;             width   - number of pixels. Pixels are handled 8 at a time,
;                       so the last width % 8 pixels are NOT converted.
;                       A width below 8 (negative included) converts nothing.
; Per iteration: reads 16 source bytes, writes 32 destination bytes.
; Clobbers:   xmm0, xmm3-xmm7, flags (edi, esi, ecx are saved and restored)
;-----------------------------------------------------------------------
cglobal Convert_YUV422_RGBA32_SSE2
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx

	mov esi, [fromPtr]
	mov edi, [toPtr]
	mov ecx, [width]

; ecx = width / 8. The shift is arithmetic so that a negative width stays
; negative and is rejected below instead of becoming a huge loop count.
	sar ecx, 3
	test ecx, ecx
	jle .done

; y mask, built once: 0x00ff in every word. Neither yuv2rgbsse2 nor
; rgba32sse2output writes xmm6, so it survives the whole loop.
	pcmpeqw xmm6, xmm6				; ffff in every word
	psrlw xmm6, 8					; 00ff in every word

.loop:								; ecx = groups of 8 pixels still to do
	prefetchnta [esi+256]
; YUV422 packed inputer
	movdqa xmm0, [esi]				; y0 u0 y1 v0 y2 u1 y3 v1 ...
	movdqa xmm3, xmm0				; second copy for the chroma
	pand xmm0, xmm6					; low byte of each word:  y0 y1 ... y7 as words
	psrlw xmm3, 8					; high byte of each word: u0 v0 u1 v1 ... as words

	yuv2rgbsse2

	rgba32sse2output

	add edi, 32						; 8 pixels * 4 bytes written
	add esi, 16						; 8 pixels * 2 bytes consumed
	sub ecx, 1
	jnz .loop

; movntdq stores are weakly ordered; make them globally visible before
; returning so that later ordinary stores cannot overtake them.
	sfence

.done:
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret
248
;-----------------------------------------------------------------------
; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
;                                  void *fromVPtr, void *toPtr, int width)
;
; Converts a run of planar YUV pixels into 32 bit pixels stored as
; b, g, r, 0. Each u and v sample is applied to two neighbouring pixels.
; Only one run of pixels is processed per call; all pointers simply advance
; linearly (presumably the caller invokes this once per row and repeats the
; chroma pointers for the second row of each pair - confirm against caller).
;
; In (stack): fromYPtr - y plane, 8 bytes read per iteration (no alignment
;                        requirement, read with movq)
;             fromUPtr - u plane, 4 bytes read per iteration (movd)
;             fromVPtr - v plane, 4 bytes read per iteration (movd)
;             toPtr    - destination; written with movntdq, must be
;                        16-byte aligned
;             width    - number of pixels. Pixels are handled 8 at a time,
;                        so the last width % 8 pixels are NOT converted.
;                        A width below 8 (negative included) converts nothing.
; Clobbers:   xmm0, xmm1, xmm3-xmm5, xmm7, flags
;             (edi, esi, ecx, eax, ebx are saved and restored)
;-----------------------------------------------------------------------
cglobal Convert_YUV420P_RGBA32_SSE2
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx
	push eax
	push ebx

	mov esi, [fromYPtr]
	mov eax, [fromUPtr]
	mov ebx, [fromVPtr]
	mov edi, [toPtr1]
	mov ecx, [width1]

; ecx = width / 8. The shift is arithmetic so that a negative width stays
; negative and is rejected below instead of becoming a huge loop count.
	sar ecx, 3
	test ecx, ecx
	jle .done

.loop:								; ecx = groups of 8 pixels still to do
; YUV420 planar inputer
	movq xmm0, [esi]				; 8 y bytes:  yyyyyyyy00000000
	movd xmm3, [eax]				; 4 u bytes:  uuuu000000000000
	movd xmm1, [ebx]				; 4 v bytes:  vvvv000000000000

	pxor xmm7, xmm7					; zero, used to widen bytes to words
	punpcklbw xmm0, xmm7			; y as words:       y0y0y0y0y0y0y0y0
	punpcklbw xmm3, xmm1			; interleave u, v:  uvuvuvuv00000000
	punpcklbw xmm3, xmm7			; u, v as words:    u0v0u0v0u0v0u0v0

	yuv2rgbsse2

	rgba32sse2output

	add edi, 32						; 8 pixels * 4 bytes written
	add esi, 8						; 8 y samples consumed
	add eax, 4						; 4 u samples consumed
	add ebx, 4						; 4 v samples consumed
	sub ecx, 1
	jnz .loop

; movntdq stores are weakly ordered; make them globally visible before
; returning so that later ordinary stores cannot overtake them.
	sfence

.done:
	pop ebx
	pop eax
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret
303
;-----------------------------------------------------------------------
; Test_SSE2 - experiment/debug helper for the packed YUV422 red channel.
;
; Reads its first two stack arguments through the fromPtr / toPtr offsets
; ([ebp+8] and [ebp+12]); any further arguments are ignored.
;
; Loads 16 bytes of y u y v data from fromPtr and stores 16 bytes to toPtr:
; eight signed 16 bit words holding the red offset
;     ((v - 128) * 5743) >> 12
; of each pixel (every u,v pair covers two pixels). The y values are
; extracted into xmm0 but the final "add y / clamp / pack" steps are
; disabled, so what is written is the offset only, not a pixel.
;
; fromPtr must be 16-byte aligned (movdqa), toPtr must be 16-byte aligned
; (movntdq). Uses SSE2 instructions only.
; Clobbers: xmm0, xmm1, xmm3, xmm6, xmm7, flags
;           (edi and esi are saved and restored)
;-----------------------------------------------------------------------
cglobal Test_SSE2
	push ebp
	mov ebp, esp
	push edi
	push esi

	mov esi, [fromPtr]
	mov edi, [toPtr]

	movdqa xmm0, [esi]				; y0 u0 y1 v0 y2 u1 y3 v1 ...
	movdqa xmm1, xmm0				; copy for u
	movdqa xmm3, xmm0				; copy for v
; extract y (masks below are written in memory byte order)
	pxor xmm7, xmm7					; 00000000000000000000000000000000
	pcmpeqd xmm6, xmm6				; ffffffffffffffffffffffffffffffff
	punpcklbw xmm6, xmm7			; ff00ff00ff00ff00ff00ff00ff00ff00
	pand xmm0, xmm6					; y0y0y0y0... (currently unused, see below)
; extract u into the low word of each dword
	psrld xmm6, 8					; 00ff0000 00ff0000 00ff0000 00ff0000
	pand xmm1, xmm6					; 0u00 in every dword
	psrld xmm1, 8					; u000 in every dword
; extract v into the high word of each dword
	pslld xmm6, 16					; 000000ff 000000ff 000000ff 000000ff
	pand xmm3, xmm6					; 000v in every dword
	psrld xmm3, 8					; 00v0 in every dword
	por xmm3, xmm1					; u0v0 in every dword: words u, v

	psubsw xmm3, [Const128]			; u = u - 128, v = v - 128

	pmaddwd xmm3, [RConst]			; u * 0 + v * 5743, one dword per pair
	psrad xmm3, 12					; back to pixel range

; Copy the low word of each dword into both of its words. This is the SSE2
; equivalent of "pshufb xmm3, [shuffconst]", which would need SSSE3.
	pshuflw xmm3, xmm3, 0xa0
	pshufhw xmm3, xmm3, 0xa0

; disabled final steps:
;	paddsw xmm3, xmm0				; add to y
;	pxor xmm7,xmm7
;	packuswb xmm3,xmm7				; clamp to 0,255 and pack R to 8 bit per pixel

	movntdq [edi], xmm3				; 8 red offsets as words, bypassing cache
	sfence							; make the non-temporal store globally visible

	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret
357	
358SECTION .note.GNU-stack noalloc noexec nowrite progbits
359