1/*
2	Copyright (c) 2004, Thomas Kurschel
3
4
5	Part of Radeon In add-on
6
7	YUV converter
8
9	The Rage Theatre always provides YUV422 data and starting with Radeon, ATI
10	moved colour converter from 2D to 3D unit, so if you need another format you
11	must convert it manually, unless you get 3D working (which is, starting with r300,
12	hopeless anyway as there is no spec).
13
14	Colour temperature is according to BT.601; for YCbCr format see also GraphicsDefs.h
15
	This header is included from VideoIn.cpp, with a define selecting
	the desired output format (RGB15, RGB16 or RGB32).
18
19	Things to improve:
20	- colour components should be interpolated for odd pixels
21*/
22
23	static const uint8 c_offs[8] =
24		{ 128, 128, 128, 128, 128, 128, 128, 128 };
25
26	static const int16 y_offs[4] =
27		{ 16*128, 16*128, 16*128, 16*128 };
28
29	static const uint16 masks[2][4] = {
30		// high byte mask
31		{ 0xff00, 0xff00, 0xff00, 0xff00 },
32		// low byte mask
33		{ 0x00ff, 0x00ff, 0x00ff, 0x00ff },
34	};
35
36	static const int16 scale[5][4] = {
37		// Y pre-scale
38		{ (int16)(1.1678 * 512), (int16)(1.1678 * 512), (int16)(1.1678 * 512), (int16)(1.1678 * 512) },
39		// CbG CrG CbG CrG
40		{ (int16)(-0.3929 * 256), (int16)(-0.8154 * 256), (int16)(-0.3929 * 256), (int16)(-0.8154 * 256) },
41		// CbB CrR CbB CrR
42		{ (int16)(2.0232 * 256), (int16)(1.6007 * 256), (int16)(2.0232 * 256), (int16)(1.6007 * 256) },
43	};
44
45	static const int8 masks_8bit[2][8] = {
46		// r/b 16 bit mask and r/g/b 15 bit mask
47		{ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8 },
48		// g 16 bit mask
49		{ 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc },
50	};
51
52	asm volatile(
53	"2:\n"
54		"pxor		%%mm7,%%mm7\n"
55
56	"1:\n"
57		"movq		(%0),%%mm0\n"		// mm0 = Cr2'Y3' Cb2'Y2' Cr0'Y1' Cb0'Y0'
58		"movq		%%mm0,%%mm1\n"		// mm1 = Cr2'Y3' Cb2'Y2' Cr0'Y1' Cb0'Y0'
59
60		// Y is in 16..235
61		// we need to substract 16 and scale to full range
62		// as standard MMX has a _signed_ integer multiply only, the highest bit must
63		// be zero before scaling, i.e. we use signed format 9.7
64		"pand		8+%4,%%mm0\n"		// mm2 =     Y3'     Y2'     Y1'     Y0'
65		"psllw		$7,%%mm0\n"
66		"psubusw	%3,%%mm0\n"
67		"pmulhw		%5,%%mm0\n"			// mm0 =      Y3      Y2      Y1      Y0
68
69		// Cb and Cr is biased; compensate that
70		"psubb		%2,%%mm1\n"			// mm1 = Cr2 xxx Cb2 xxx Cr0 xxx Cb0 xxx
71		"pand		%4,%%mm1\n"			// mm1 = Cr2     Cb2     Cr0     Cb0
72
73		// transform Cb and Cr to green component
74		"movq		%%mm1,%%mm2\n"
75		"pmaddwd	8+%5,%%mm1\n"		// mm1 =  CbCrG2 xxxxxxx  CbCrG0 xxxxxxx
76		"psrad		$16,%%mm1\n"		// mm1 =          CbCrG2          CbCrG0
77		"packssdw	%%mm1,%%mm1\n"		// mm1 =  CbCrG2  CbCrG0  CbCrG2  CbCrG0
78		"punpcklwd	%%mm1,%%mm1\n"		// mm1 =  CbCrG2  CbCrG2  CbCrG0  CbCrG0
79
80		// transform Cb to blue and Cr to red component
81		"pmulhw		16+%5,%%mm2\n"		// mm2 =    CrR2    CbB2    CrR0    CbB0
82
83		// nasty shuffling to separate and duplicate components
84		"movq		%%mm2,%%mm3\n"
85		"punpcklwd	%%mm3,%%mm3\n"		// mm3 =    CrR0    CrR0    CbB0    CbB0
86		"punpckhwd	%%mm2,%%mm2\n"		// mm2 =    CrR2    CrR2    CbB2    CbB2
87
88		"movq		%%mm3,%%mm4\n"
89		"punpckldq	%%mm2,%%mm3\n"		// mm3 =    CbB2    CbB2    CbB0    CbB0
90		"punpckhdq	%%mm2,%%mm4\n"		// mm4 =    CrR2    CrR2    CrR0    CrR0
91
92		// add Y to get final RGB
93		"paddsw		%%mm0,%%mm1\n"		// mm1 =      G3      G2      G1      G0
94		"paddsw		%%mm0,%%mm3\n"		// mm3 =      B3      B2      B1      B0
95		"paddsw		%%mm0,%%mm4\n"		// mm4 =      R3      R2      R1      R0
96
97		// now, RBG can be converted to 8 bits each
98		"packuswb	%%mm0,%%mm1\n"		// mm1 =  Y3  Y2  Y1  Y0  G3  G2  G1  G0
99		"packuswb	%%mm4,%%mm3\n"		// mm3 =  R3  R2  R1  R0  B3  B2  B1  B0
100
101#ifdef RGB32
102		// convertion to RGB32
103		"movq		%%mm3,%%mm2\n"
104		"punpckhbw	%%mm1,%%mm3\n"		// mm3 =  Y3  R3  Y2  R2  Y1  R1  Y0  R0
105		"punpcklbw	%%mm1,%%mm2\n"		// mm2 =  G3  B3  G2  B2  G1  B1  G0  B0
106
107		"movq		%%mm2,%%mm1\n"
108		"punpcklwd	%%mm3,%%mm2\n"
109		"movq		%%mm2,0x00(%1)\n"	// dst =  Y1  R1  G1  B1  Y0  R0  G0  B0
110
111		"punpckhwd	%%mm3,%%mm1\n"
112		"movq		%%mm1,0x08(%1)\n"	// dst =  Y3  R3  G3  B3  Y2  R2  G2  B2
113
114		"addl		$0x08,%0\n"			// source += 8
115		"addl		$0x10,%1\n"			// destination += 16
116		"subl		$0x10,%7\n"			// next pixels
117#endif
118
119#ifdef RGB16
120		// convertion to RGB16
121		// things would be much easier if Intel had added a RGB32->RGB16 instruction
122		"pand		%6,%%mm3\n"			//  mm3 -  R3  R2  R1  R0  B3  B2  B1  B0 (masked)
123		"pand		8+%6,%%mm1\n"		//  mm1 -  Y3  Y2  Y1  Y0  G3  G2  G1  G0 (masked)
124
125		"punpcklbw	%%mm7,%%mm1\n"		//  mm1 -      G3      G2      G1      G0
126		"movq		%%mm7,%%mm2\n"
127		"punpckhbw 	%%mm3,%%mm2\n"		//  mm2 -  R3      R2      R1      R0
128		"punpcklbw 	%%mm7,%%mm3\n"		//  mm3 -      B3      B2      B1      B0
129
130		"psllw		$3,%%mm1\n"			//  mm1 -    G3      G2      G1      G0
131		"psrlw		$3,%%mm3\n"			//  mm3 -      B3      B2      B1      B0
132
133		"por		%%mm2,%%mm1\n"
134		"por		%%mm3,%%mm1\n"
135		"movq		%%mm1,(%1)\n"
136
137		"addl		$0x08,%0\n"			// source += 8
138		"addl		$0x08,%1\n"			// destination += 8
139		"subl		$0x08,%7\n"			// next pixels
140#endif
141
142#ifdef RGB15
143		// convertion to RGB15
144		// same problem as before
145		"pand		%6,%%mm3\n"			//  mm3 -  R3  R2  R1  R0  B3  B2  B1  B0 (masked)
146		"pand		%6,%%mm1\n"			//  mm1 -  Y3  Y2  Y1  Y0  G3  G2  G1  G0 (masked)
147
148		"punpcklbw	%%mm7,%%mm1\n"		//  mm1 -      G3      G2      G1      G0
149		"movq		%%mm7,%%mm2\n"
150		"punpckhbw 	%%mm3,%%mm2\n"		//  mm2 -  R3      R2      R1      R0
151		"punpcklbw 	%%mm7,%%mm3\n"		//  mm3 -      B3      B2      B1      B0
152
153		"psllw		$2,%%mm1\n"			//  mm1 -    G3      G2      G1      G0
154		"psrlw		$1,%%mm2\n"			//  mm2 -  R3      R2      R1      R0
155		"psrlw		$3,%%mm3\n"			//  mm3 -      B3      B2      B1      B0
156
157		"por		%%mm2,%%mm1\n"
158		"por		%%mm3,%%mm1\n"
159		"movq		%%mm1,(%1)\n"
160
161		"addl		$0x08,%0\n"			// source += 8
162		"addl		$0x08,%1\n"			// destination += 8
163		"subl		$0x08,%7\n"			// next pixels
164#endif
165
166		// next
167		"jg			1b\n"
168
169		"movl		%9,%7\n"
170		"subl		%7,%8\n"
171
172		"jg			2b\n"
173		"emms\n"
174		:
175		: "a" (convert_buffer), "d" (bits),
176		  "g" (c_offs), "g" (y_offs), "g" (masks), "g" (scale), "g" (masks_8bit),
177		  "c" (bytesPerRow), "S" (bitsLength), "D" (bytesPerRow));
178