;
; Copyright (C) 2009-2010 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;

; A rather unoptimised set of yuv to rgb converters
; does 8 pixels at a time (SSE2) or 4 pixels at a time (SSE/MMX)

; inputer:
; reads 128 bits of 8 bit yuv data and puts
; the y values converted to 16 bit in xmm0
; the u values converted to 16 bit and duplicated into xmm1
; the v values converted to 16 bit and duplicated into xmm2

; conversion:
; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as 8 bit clamped values
; r values in xmm3
; g values in xmm4
; b values in xmm5
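;
; for reference, the per pixel math implemented by the conversion macros below
; (illustrative pseudo code only, not assembled; clamp() is a hypothetical
; saturate to the 0..255 range, matching what packuswb does):
;   y = y - 16;  u = u - 128;  v = v - 128;
;   r = clamp(y + v + (v >> 2) + (v >> 3) + (v >> 5));
;   g = clamp(y - (u >> 2) - (u >> 4) - (u >> 5) - (v >> 1) - (v >> 3) - (v >> 4) - (v >> 5));
;   b = clamp(y + u + (u >> 1) + (u >> 2) + (u >> 6));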

; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha (stored b,g,r,a in memory)

; xmm6 used for scratch
; xmm7 used for scratch

%macro  cglobal 1
	global  _%1
	%define %1 _%1
	align 16
%1:
%endmacro
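; e.g. "cglobal Convert_YUV422_RGBA32_SSE2" exports _Convert_YUV422_RGBA32_SSE2
; and starts a 16 byte aligned code label for it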

; conversion code
%macro yuv2rgbsse2 0
; u = u - 128
; v = v - 128
; r = y + v + (v >> 2) + (v >> 3) + (v >> 5)
; g = y - ((u >> 2) + (u >> 4) + (u >> 5)) - ((v >> 1) + (v >> 3) + (v >> 4) + (v >> 5))
; b = y + u + (u >> 1) + (u >> 2) + (u >> 6)
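; the shift sums approximate the usual yuv to rgb coefficients:
;   1 + 1/4 + 1/8 + 1/32    = 1.40625  (~1.402, r from v)
;   1/4 + 1/16 + 1/32       = 0.34375  (~0.344, g from u)
;   1/2 + 1/8 + 1/16 + 1/32 = 0.71875  (~0.714, g from v)
;   1 + 1/2 + 1/4 + 1/64    = 1.765625 (~1.772, b from u)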
; subtract 16 from y
	movdqa xmm7, [Const16]			; loads a constant using data cache (slower on first fetch but then cached)
	psubsw xmm0,xmm7				; y = y - 16
; subtract 128 from u and v
	movdqa xmm7, [Const128]			; loads a constant using data cache (slower on first fetch but then cached)
	psubsw xmm1,xmm7				; u = u - 128
	psubsw xmm2,xmm7				; v = v - 128
; load r,b with y
	movdqa xmm3,xmm0				; r = y
	pshufd xmm5,xmm0, 0xE4			; b = y (0xE4 is the identity shuffle, i.e. a copy)

; r = y + v + (v >> 2) + (v >> 3) + (v >> 5)
	paddsw xmm3, xmm2				; add v to r
	movdqa xmm7, xmm1				; move u to scratch
	pshufd xmm6, xmm2, 0xE4			; move v to scratch

	psraw  xmm6,2					; divide v by 4
	paddsw xmm3, xmm6				; and add to r
	psraw  xmm6,1					; divide v by 2 (now v >> 3)
	paddsw xmm3, xmm6				; and add to r
	psraw  xmm6,2					; divide v by 4 (now v >> 5)
	paddsw xmm3, xmm6				; and add to r

; b = y + u + (u >> 1) + (u >> 2) + (u >> 6)
	paddsw xmm5, xmm1				; add u to b
	psraw  xmm7,1					; divide u by 2
	paddsw xmm5, xmm7				; and add to b
	psraw  xmm7,1					; divide u by 2 (now u >> 2)
	paddsw xmm5, xmm7				; and add to b
	psraw  xmm7,4					; divide u by 16 (now u >> 6)
	paddsw xmm5, xmm7				; and add to b

; g = y - (u >> 2) - (u >> 4) - (u >> 5) - (v >> 1) - (v >> 3) - (v >> 4) - (v >> 5)
	movdqa xmm7,xmm2				; move v to scratch
	pshufd xmm6,xmm1, 0xE4			; move u to scratch
	movdqa xmm4,xmm0				; g = y

	psraw  xmm6,2					; divide u by 4
	psubsw xmm4,xmm6				; subtract from g
	psraw  xmm6,2					; divide u by 4 (now u >> 4)
	psubsw xmm4,xmm6				; subtract from g
	psraw  xmm6,1					; divide u by 2 (now u >> 5)
	psubsw xmm4,xmm6				; subtract from g

	psraw  xmm7,1					; divide v by 2
	psubsw xmm4,xmm7				; subtract from g
	psraw  xmm7,2					; divide v by 4 (now v >> 3)
	psubsw xmm4,xmm7				; subtract from g
	psraw  xmm7,1					; divide v by 2 (now v >> 4)
	psubsw xmm4,xmm7				; subtract from g
	psraw  xmm7,1					; divide v by 2 (now v >> 5)
	psubsw xmm4,xmm7				; subtract from g
%endmacro

; conversion code
%macro yuv2rgbsse 0
; u = u - 128
; v = v - 128
; r = y + v + (v >> 2) + (v >> 3) + (v >> 5)
; g = y - ((u >> 2) + (u >> 4) + (u >> 5)) - ((v >> 1) + (v >> 3) + (v >> 4) + (v >> 5))
; b = y + u + (u >> 1) + (u >> 2) + (u >> 6)
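; same fixed point approximation as yuv2rgbsse2 above, but on 64 bit mmx
; registers, so it works on 4 pixels at a time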
; subtract 16 from y
	movq mm7, [Const16]				; loads a constant using data cache (slower on first fetch but then cached)
	psubsw mm0,mm7					; y = y - 16
; subtract 128 from u and v
	movq mm7, [Const128]			; loads a constant using data cache (slower on first fetch but then cached)
	psubsw mm1,mm7					; u = u - 128
	psubsw mm2,mm7					; v = v - 128
; load r,b with y
	movq mm3,mm0					; r = y
	pshufw mm5,mm0, 0xE4			; b = y (0xE4 is the identity shuffle, i.e. a copy)

; r = y + v + (v >> 2) + (v >> 3) + (v >> 5)
	paddsw mm3, mm2					; add v to r
	movq mm7, mm1					; move u to scratch
	pshufw mm6, mm2, 0xE4			; move v to scratch

	psraw  mm6,2					; divide v by 4
	paddsw mm3, mm6					; and add to r
	psraw  mm6,1					; divide v by 2 (now v >> 3)
	paddsw mm3, mm6					; and add to r
	psraw  mm6,2					; divide v by 4 (now v >> 5)
	paddsw mm3, mm6					; and add to r

; b = y + u + (u >> 1) + (u >> 2) + (u >> 6)
	paddsw mm5, mm1					; add u to b
	psraw  mm7,1					; divide u by 2
	paddsw mm5, mm7					; and add to b
	psraw  mm7,1					; divide u by 2 (now u >> 2)
	paddsw mm5, mm7					; and add to b
	psraw  mm7,4					; divide u by 16 (now u >> 6)
	paddsw mm5, mm7					; and add to b

; g = y - (u >> 2) - (u >> 4) - (u >> 5) - (v >> 1) - (v >> 3) - (v >> 4) - (v >> 5)
	movq mm7,mm2					; move v to scratch
	pshufw mm6,mm1, 0xE4			; move u to scratch
	movq mm4,mm0					; g = y

	psraw  mm6,2					; divide u by 4
	psubsw mm4,mm6					; subtract from g
	psraw  mm6,2					; divide u by 4 (now u >> 4)
	psubsw mm4,mm6					; subtract from g
	psraw  mm6,1					; divide u by 2 (now u >> 5)
	psubsw mm4,mm6					; subtract from g

	psraw  mm7,1					; divide v by 2
	psubsw mm4,mm7					; subtract from g
	psraw  mm7,2					; divide v by 4 (now v >> 3)
	psubsw mm4,mm7					; subtract from g
	psraw  mm7,1					; divide v by 2 (now v >> 4)
	psubsw mm4,mm7					; subtract from g
	psraw  mm7,1					; divide v by 2 (now v >> 5)
	psubsw mm4,mm7					; subtract from g
%endmacro

; outputer
%macro rgba32sse2output 0
; clamp values
	pxor xmm7,xmm7
	packuswb xmm3,xmm7				; clamp to 0,255 and pack R to 8 bit per pixel
	packuswb xmm4,xmm7				; clamp to 0,255 and pack G to 8 bit per pixel
	packuswb xmm5,xmm7				; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
	punpcklbw xmm5,xmm4				; bgbgbgbgbgbgbgbg
	movdqa xmm0, xmm5				; save bg values
	punpcklbw xmm3,xmm7				; r0r0r0r0r0r0r0r0
	punpcklwd xmm5,xmm3				; lower half bgr0bgr0bgr0bgr0
	punpckhwd xmm0,xmm3				; upper half bgr0bgr0bgr0bgr0
; write to output ptr
	movntdq [edi], xmm5				; output first 4 pixels bypassing cache
	movntdq [edi+16], xmm0			; output second 4 pixels bypassing cache
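; note: movntdq needs a 16 byte aligned destination, so the output buffer
; passed in toPtr is assumed to be 16 byte aligned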
%endmacro

; outputer
%macro rgba32sseoutput 0
; clamp values
	pxor mm7,mm7
	packuswb mm3,mm7				; clamp to 0,255 and pack R to 8 bit per pixel
	packuswb mm4,mm7				; clamp to 0,255 and pack G to 8 bit per pixel
	packuswb mm5,mm7				; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
	punpcklbw mm5,mm4				; bgbgbgbg
	movq mm0, mm5					; save bg values
	punpcklbw mm3,mm7				; r0r0r0r0
	punpcklwd mm5,mm3				; lower half bgr0bgr0
	punpckhwd mm0,mm3				; upper half bgr0bgr0
; write to output ptr
	movq [edi], mm5					; output first 2 pixels
	movq [edi+8], mm0				; output second 2 pixels
%endmacro

SECTION .data align=16

Const16	dw	16
	dw	16
	dw	16
	dw	16
	dw	16
	dw	16
	dw	16
	dw	16

Const128	dw	128
	dw	128
	dw	128
	dw	128
	dw	128
	dw	128
	dw	128
	dw	128

; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
%define  width    ebp+16
%define  toPtr    ebp+12
%define  fromPtr  ebp+8

; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
%define width1    ebp+24
%define toPtr1    ebp+20
%define fromVPtr  ebp+16
%define fromUPtr  ebp+12
%define fromYPtr  ebp+8
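; illustrative callers (cdecl, 32 bit; buffer names are hypothetical):
;   Convert_YUV422_RGBA32_SSE2(yuyvLine, rgbaLine, width);
;   Convert_YUV420P_RGBA32_SSE2(yLine, uLine, vLine, rgbaLine, width);
; width is in pixels and is processed in blocks of 8 (SSE2) or 4 (SSE),
; any remaining pixels are left unconverted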

SECTION .text align=16

cglobal Convert_YUV422_RGBA32_SSE2
; save registers
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx

	mov esi, [fromPtr]
	mov edi, [toPtr]
	mov ecx, [width]
; loop width / 8 times
	shr ecx,3
	test ecx,ecx
	jng ENDLOOP
REPEATLOOP:							; loop over width / 8
; YUV422 packed inputer
	movdqa xmm0, [esi]				; should have yuyv yuyv yuyv yuyv
	pshufd xmm1, xmm0, 0xE4			; copy to xmm1
	movdqa xmm2, xmm0				; copy to xmm2
; extract y
	pxor xmm7,xmm7					; 00000000000000000000000000000000
	pcmpeqd xmm6,xmm6				; ffffffffffffffffffffffffffffffff
	punpcklbw xmm6,xmm7				; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
	pand xmm0, xmm6					; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
	psrld xmm6,8					; 00ff0000 00ff0000 00ff0000 00ff0000
	pand xmm1, xmm6					; clear all yv values leaving 0u00 etc
	psrld xmm1,8					; shift u down to get u000
	pshuflw xmm1,xmm1, 0xA0			; copy u values
	pshufhw xmm1,xmm1, 0xA0			; to get u0u0
; extract v
	pslld xmm6,16					; 000000ff 000000ff 000000ff 000000ff
	pand xmm2, xmm6					; clear all yu values leaving 000v etc
	psrld xmm2,8					; shift v down to get 00v0
	pshuflw xmm2,xmm2, 0xF5			; copy v values
	pshufhw xmm2,xmm2, 0xF5			; to get v0v0
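; registers now hold, as 16 bit words (low word first):
;   xmm0 = y0 y1 y2 y3 y4 y5 y6 y7
;   xmm1 = u0 u0 u1 u1 u2 u2 u3 u3
;   xmm2 = v0 v0 v1 v1 v2 v2 v3 v3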

yuv2rgbsse2

rgba32sse2output

; endloop
	add edi,32
	add esi,16
	sub ecx, 1				; apparently sub is better than dec
	jnz REPEATLOOP
ENDLOOP:
; Cleanup
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

cglobal Convert_YUV420P_RGBA32_SSE2
; save registers
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx
	push eax
	push ebx

	mov esi, [fromYPtr]
	mov eax, [fromUPtr]
	mov ebx, [fromVPtr]
	mov edi, [toPtr1]
	mov ecx, [width1]
; loop width / 8 times
	shr ecx,3
	test ecx,ecx
	jng ENDLOOP1
REPEATLOOP1:						; loop over width / 8
; YUV420 Planar inputer
	movq xmm0, [esi]				; fetch 8 y values (8 bit) yyyyyyyy00000000
	movd xmm1, [eax]				; fetch 4 u values (8 bit) uuuu000000000000
	movd xmm2, [ebx]				; fetch 4 v values (8 bit) vvvv000000000000

; extract y
	pxor xmm7,xmm7					; 00000000000000000000000000000000
	punpcklbw xmm0,xmm7				; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
; extract u and duplicate so each becomes 0u0u
	punpcklbw xmm1,xmm7				; interleave xmm7 into xmm1 u0u0u0u000000000
	punpcklwd xmm1,xmm7				; interleave again u000u000u000u000
	pshuflw xmm1,xmm1, 0xA0			; copy u values
	pshufhw xmm1,xmm1, 0xA0			; to get u0u0
; extract v
	punpcklbw xmm2,xmm7				; interleave xmm7 into xmm2 v0v0v0v000000000
	punpcklwd xmm2,xmm7				; interleave again v000v000v000v000
	pshuflw xmm2,xmm2, 0xA0			; copy v values
	pshufhw xmm2,xmm2, 0xA0			; to get v0v0
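; xmm0, xmm1 and xmm2 now hold the same word layout as in the packed loop above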

yuv2rgbsse2

rgba32sse2output

; endloop
	add edi,32
	add esi,8
	add eax,4
	add ebx,4
	sub ecx, 1				; apparently sub is better than dec
	jnz REPEATLOOP1
ENDLOOP1:
; Cleanup
	pop ebx
	pop eax
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

cglobal Convert_YUV422_RGBA32_SSE
; save registers
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx

	mov esi, [fromPtr]
	mov ecx, [width]
	mov edi, [toPtr]
; loop width / 4 times
	shr ecx,2
	test ecx,ecx
	jng ENDLOOP2
REPEATLOOP2:						; loop over width / 4

; YUV422 packed inputer
	movq mm0, [esi]					; should have yuyv yuyv
	pshufw mm1, mm0, 0xE4			; copy to mm1
	movq mm2, mm0					; copy to mm2
; extract y
	pxor mm7,mm7					; 0000000000000000
	pcmpeqb mm6,mm6					; ffffffffffffffff
	punpckhbw mm6,mm7				; interleave mm7 into mm6 ff00ff00ff00ff00
	pand mm0, mm6					; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
	psrld mm6,8						; 00ff0000 00ff0000
	pand mm1, mm6					; clear all yv values leaving 0u00 etc
	psrld mm1,8						; shift u down to get u000
	pshufw mm1,mm1, 0xA0			; copy u values to get u0u0		(SSE not MMX)
; extract v
	pslld mm6,16					; 000000ff 000000ff
	pand mm2, mm6					; clear all yu values leaving 000v etc
	psrld mm2,8						; shift v down to get 00v0
	pshufw mm2,mm2, 0xF5			; copy v values to get v0v0		(SSE not MMX)
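; mm0 = y0 y1 y2 y3, mm1 = u0 u0 u1 u1, mm2 = v0 v0 v1 v1 (16 bit words)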

yuv2rgbsse

rgba32sseoutput

; endloop
	add edi,16
	add esi,8
	sub ecx, 1						; apparently sub is better than dec
	jnz REPEATLOOP2
ENDLOOP2:
; Cleanup
	emms							; reset mmx regs back to float
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

cglobal Convert_YUV420P_RGBA32_SSE
; save registers
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx
	push eax
	push ebx

	mov esi, [fromYPtr]
	mov eax, [fromUPtr]
	mov ebx, [fromVPtr]
	mov edi, [toPtr1]
	mov ecx, [width1]
; loop width / 4 times
	shr ecx,2
	test ecx,ecx
	jng ENDLOOP3
REPEATLOOP3:						; loop over width / 4
; YUV420 Planar inputer
	movq mm0, [esi]					; fetch 8 y values (8 bit), only the low 4 are used
	movd mm1, [eax]					; fetch 4 u values (8 bit), only the low 2 are used
	movd mm2, [ebx]					; fetch 4 v values (8 bit), only the low 2 are used

; extract y
	pxor mm7,mm7					; 0000000000000000
	punpcklbw mm0,mm7				; interleave mm7 into mm0 y0y0y0y0
; extract u and duplicate so each becomes 0u0u
	punpcklbw mm1,mm7				; interleave mm7 into mm1 u0u00000
	punpcklwd mm1,mm7				; interleave again u000u000
	pshufw mm1,mm1, 0xA0			; copy u values to get u0u0
; extract v
	punpcklbw mm2,mm7				; interleave mm7 into mm2 v0v00000
	punpcklwd mm2,mm7				; interleave again v000v000
	pshufw mm2,mm2, 0xA0			; copy v values to get v0v0

yuv2rgbsse

rgba32sseoutput

; endloop
	add edi,16
	add esi,4
	add eax,2
	add ebx,2
	sub ecx, 1				; apparently sub is better than dec
	jnz REPEATLOOP3
ENDLOOP3:
; Cleanup
	emms
	pop ebx
	pop eax
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

SECTION .note.GNU-stack noalloc noexec nowrite progbits