1;
2; Copyright (C) 2009-2010 David McPaul
3;
4; All rights reserved. Distributed under the terms of the MIT License.
5;
6
7; A rather unoptimised set of ssse3 yuv to rgb converters
8; does 8 pixels per loop
9
; inputer:
; reads 8 pixels of 8 bit yuv data per loop and puts
; the y values converted to 16 bit in xmm0
; the u and v values converted to 16 bit and interleaved
; (u0 v0 u1 v1 u2 v2 u3 v3) in xmm3, one u/v pair per two pixels
15
; conversion:
; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as signed 16 bit values
; (clamping to 8 bit is done by the outputer)
; r values in xmm3
; g values in xmm4
; b values in xmm5
22
; outputer:
; clamps r, g and b to 8 bit and writes out 8 pixels of 32 bits each,
; stored in memory as b, g, r, a bytes with 0 for alpha
25
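; xmm7 is used as an all zero register when packing and unpacking
; xmm0 is reused as scratch by the outputer
; xmm6 is not used by the code in this file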
28
; cglobal <name>
; Starts an exported routine:
;   - pads to a 16 byte boundary,
;   - exports the symbol _<name>,
;   - makes the plain <name> expand to _<name> for the rest of the file,
;   - emits the entry label (which therefore is _<name>).
%macro  cglobal 1
	align	16
	global	_%1
	; the alias must be defined after the global line and before the label
	%define	%1 _%1
%1:
%endmacro
35
36SECTION .data align=16
37
; 8 words of 16: luma offset, only referenced by the disabled "y = y - 16" step
Const16:	times 8 dw 16
46
; 8 words of 128: bias subtracted from the 16 bit u and v values
Const128:	times 8 dw 128
55	
; pshufb control: source bytes 1, 5, 9, 13 each placed twice as zero extended
; words (0x80 selects a zero byte). Not referenced by the code in this file.
UMask:	db	0x01, 0x80, 0x01, 0x80, 0x05, 0x80, 0x05, 0x80
		db	0x09, 0x80, 0x09, 0x80, 0x0d, 0x80, 0x0d, 0x80
72
; pshufb control: source bytes 3, 7, 11, 15 each placed twice as zero extended
; words (0x80 selects a zero byte). Not referenced by the code in this file.
VMask:	db	0x03, 0x80, 0x03, 0x80, 0x07, 0x80, 0x07, 0x80
		db	0x0b, 0x80, 0x0b, 0x80, 0x0f, 0x80, 0x0f, 0x80
89	
; pshufb control: even source bytes (the y samples of packed yuyv data)
; zero extended to 8 words (0x80 selects a zero byte)
YMask:	db	0x00, 0x80, 0x02, 0x80, 0x04, 0x80, 0x06, 0x80
		db	0x08, 0x80, 0x0a, 0x80, 0x0c, 0x80, 0x0e, 0x80
106
; pshufb control: odd source bytes (the u and v samples of packed yuyv data)
; zero extended to 8 words, giving u0 v0 u1 v1 u2 v2 u3 v3
UVMask:	db	0x01, 0x80, 0x03, 0x80, 0x05, 0x80, 0x07, 0x80
		db	0x09, 0x80, 0x0b, 0x80, 0x0d, 0x80, 0x0f, 0x80
123
; pshufb control: copies the low word of every dword into both words of
; that dword, so one chroma result is shared by two neighbouring pixels
shuffconst:	db	0x00, 0x01, 0x00, 0x01, 0x04, 0x05, 0x04, 0x05
			db	0x08, 0x09, 0x08, 0x09, 0x0c, 0x0d, 0x0c, 0x0d
140
; pmaddwd weights for red, per (u, v) word pair: 0 * u + 5743 * v
; (5743 / 4096 is about 1.402)
RConst:	times 4 dw 0, 5743
149		
; pmaddwd weights for green, per (u, v) word pair: -1409 * u + -2925 * v
; (about -0.344 and -0.714 when divided by 4096)
GConst:	times 4 dw -1409, -2925
158		
; pmaddwd weights for blue, per (u, v) word pair: 7258 * u + 0 * v
; (7258 / 4096 is about 1.772)
BConst:	times 4 dw 7258, 0
167
; conversion code
;
; In:    xmm0 = 8 y values as 16 bit words
;        xmm3 = 4 u/v pairs as 16 bit words (u0 v0 u1 v1 u2 v2 u3 v3)
; Out:   xmm3 = r, xmm4 = g, xmm5 = b, 8 signed 16 bit words each (not clamped)
;        xmm0 is left unchanged
;
; With u and v biased by -128, and the weights stored scaled by 4096:
;   r = y +  0     * u +  1.403 * v
;   g = y + -0.344 * u + -0.714 * v
;   b = y +  1.773 * u +  0     * v
%macro yuv2rgbssse3 0
; remove the chroma bias and keep one copy of the u/v pairs per channel
	psubsw	xmm3, [Const128]		; u = u - 128, v = v - 128
	movdqa	xmm4, xmm3				; copy for green
	movdqa	xmm5, xmm3				; copy for blue

; luma offset removal is disabled
;	psubsw xmm0, [Const16]			; y = y - 16

; red: weighted sum per u/v pair (32 bit), back to integer range,
; one result duplicated for both pixels of the pair, then added to y
	pmaddwd	xmm3, [RConst]
	psrad	xmm3, 12
	pshufb	xmm3, [shuffconst]
	paddsw	xmm3, xmm0

; green: same steps with the green weights
	pmaddwd	xmm4, [GConst]
	psrad	xmm4, 12
	pshufb	xmm4, [shuffconst]
	paddsw	xmm4, xmm0

; blue: same steps with the blue weights
	pmaddwd	xmm5, [BConst]
	psrad	xmm5, 12
	pshufb	xmm5, [shuffconst]
	paddsw	xmm5, xmm0
%endmacro
200
; outputer
;
; In:    xmm3 = r, xmm4 = g, xmm5 = b, 8 signed 16 bit words each
;        edi  = destination for 8 pixels (32 bytes); movntdq needs it to be
;               16 byte aligned
; Out:   memory at edi holds b, g, r, 0 for each of the 8 pixels
; Uses:  xmm0, xmm3, xmm4, xmm5 are overwritten, xmm7 is left as zero
%macro rgba32ssse3output 0
	pxor		xmm7, xmm7			; zero source for packing and unpacking

; blue and green: clamp to 0,255, pack to bytes and interleave
	packuswb	xmm5, xmm7			; b values, 8 bit per pixel
	packuswb	xmm4, xmm7			; g values, 8 bit per pixel
	punpcklbw	xmm5, xmm4			; bgbgbgbgbgbgbgbg

; red: clamp to 0,255, pack to bytes and pair every value with a zero alpha
	packuswb	xmm3, xmm7			; r values, 8 bit per pixel
	punpcklbw	xmm3, xmm7			; r0r0r0r0r0r0r0r0

; merge the bg and r0 words into 32 bit pixels
	movdqa		xmm0, xmm5			; second copy of bg for the upper half
	punpcklwd	xmm5, xmm3			; pixels 0-3 bgr0bgr0bgr0bgr0
	punpckhwd	xmm0, xmm3			; pixels 4-7 bgr0bgr0bgr0bgr0

; write to output ptr with non temporal stores (bypassing the cache)
	movntdq		[edi], xmm5
	movntdq		[edi+16], xmm0
%endmacro
218
219
; Stack argument slots. Each name expands to an ebp relative address and is
; used as [name] once the routine has executed "push ebp / mov ebp, esp".

; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
%define fromPtr		ebp+8
%define toPtr		ebp+12
%define width		ebp+16

; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
%define fromYPtr	ebp+8
%define fromUPtr	ebp+12
%define fromVPtr	ebp+16
%define toPtr1		ebp+20
%define width1		ebp+24
231
232SECTION .text align=16
233
; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
;
; Converts one row of packed 4:2:2 data (y u y v byte order) to 32 bit pixels.
;
; In:    fromPtr = source row, read with movdqa so it must be 16 byte aligned
;        toPtr   = destination row, written with movntdq so it must be
;                  16 byte aligned
;        width   = number of pixels; processed in groups of 8, a remainder
;                  of 1 to 7 pixels is NOT converted; zero or negative
;                  values convert nothing
; Out:   nothing is returned
; Saves: edi, esi, ecx and ebp are restored before returning
; Uses:  xmm0, xmm3, xmm4, xmm5, xmm7 and the flags are overwritten
cglobal Convert_YUV422_RGBA32_SSSE3
; set up the frame and save the registers used below
	push	ebp
	mov		ebp, esp
	push	edi
	push	esi
	push	ecx

	mov		esi, [fromPtr]
	mov		edi, [toPtr]
	mov		ecx, [width]
; loop width / 8 times
; arithmetic shift keeps the sign, so a negative width stays negative and is
; rejected below instead of turning into a huge unsigned loop count
	sar		ecx, 3
	test	ecx, ecx				; flags after a multi bit sar are not all defined
	jle		.done
.loop:								; loop over width / 8
	prefetchnta [esi+256]
; YUV422 packed inputer
	movdqa	xmm0, [esi]				; should have yuyv yuyv yuyv yuyv
	movdqa	xmm3, xmm0				; copy to xmm3
; extract the y values giving y0y0...
	pshufb	xmm0, [YMask]
; extract u and v to have u0v0...
	pshufb	xmm3, [UVMask]

yuv2rgbssse3

rgba32ssse3output

; endloop
	add		edi, 32					; 8 pixels of 4 bytes written
	add		esi, 16					; 8 pixels of 2 bytes read
	sub		ecx, 1					; apparently sub is better than dec
	jnz		.loop
; the non temporal stores are weakly ordered: make them globally visible
; before any store the caller does afterwards
	sfence
.done:
; Cleanup
	pop		ecx
	pop		esi
	pop		edi
	mov		esp, ebp
	pop		ebp
	ret
276
; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
;
; Converts one row of planar data (separate y, u and v rows, one u and one v
; sample per two pixels) to 32 bit pixels.
;
; In:    fromYPtr = y row, 8 bytes read per loop (movq, no alignment needed)
;        fromUPtr = u row, 4 bytes read per loop (movd, no alignment needed)
;        fromVPtr = v row, 4 bytes read per loop (movd, no alignment needed)
;        toPtr    = destination row, written with movntdq so it must be
;                   16 byte aligned
;        width    = number of pixels; processed in groups of 8, a remainder
;                   of 1 to 7 pixels is NOT converted; zero or negative
;                   values convert nothing
; Out:   nothing is returned
; Saves: edi, esi, ecx, eax, ebx and ebp are restored before returning
; Uses:  xmm0, xmm1, xmm3, xmm4, xmm5, xmm7 and the flags are overwritten
cglobal Convert_YUV420P_RGBA32_SSSE3
; set up the frame and save the registers used below
	push	ebp
	mov		ebp, esp
	push	edi
	push	esi
	push	ecx
	push	eax
	push	ebx

	mov		esi, [fromYPtr]
	mov		eax, [fromUPtr]
	mov		ebx, [fromVPtr]
	mov		edi, [toPtr1]
	mov		ecx, [width1]
; loop width / 8 times
; arithmetic shift keeps the sign, so a negative width stays negative and is
; rejected below instead of turning into a huge unsigned loop count
	sar		ecx, 3
	test	ecx, ecx				; flags after a multi bit sar are not all defined
	jle		.done
.loop:								; loop over width / 8
	prefetchnta [esi+256]
	prefetchnta [eax+128]
	prefetchnta [ebx+128]

; YUV420 Planar inputer
	movq	xmm0, [esi]				; fetch 8 y values (8 bit) yyyyyyyy00000000
	movd	xmm3, [eax]				; fetch 4 u values (8 bit) uuuu000000000000
	movd	xmm1, [ebx]				; fetch 4 v values (8 bit) vvvv000000000000

; convert y to 16 bit
	pxor	xmm7, xmm7				; 00000000000000000000000000000000
	punpcklbw xmm0, xmm7			; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0

; combine u and v
	punpcklbw xmm3, xmm1			; uvuvuvuv00000000
	punpcklbw xmm3, xmm7			; u0v0u0v0u0v0u0v0

yuv2rgbssse3

rgba32ssse3output

; endloop
	add		edi, 32					; 8 pixels of 4 bytes written
	add		esi, 8					; 8 y samples read
	add		eax, 4					; 4 u samples read
	add		ebx, 4					; 4 v samples read
	sub		ecx, 1					; apparently sub is better than dec
	jnz		.loop
; the non temporal stores are weakly ordered: make them globally visible
; before any store the caller does afterwards
	sfence
.done:
; Cleanup
	pop		ebx
	pop		eax
	pop		ecx
	pop		esi
	pop		edi
	mov		esp, ebp
	pop		ebp
	ret
335
336SECTION .note.GNU-stack noalloc noexec nowrite progbits
337