/*
	Copyright (c) 2004, Thomas Kurschel
	

	Part of Radeon In add-on
		
	YUV converter
	
	The Rage Theatre always provides YUV422 data and starting with Radeon, ATI
	moved colour converter from 2D to 3D unit, so if you need another format you
	must convert it manually, unless you get 3D working (which is, starting with r300,
	hopeless anyway as there is no spec). 
	
	Colour temperature is according to BT.601; for YCbCr format see also GraphicsDefs.h
	
	This header is included from VideoIn.cpp, with one of several defines
	selecting the desired output format (RGB15, RGB16 or RGB32).
	
	Things to improve:
	- colour components should be interpolated for odd pixels
*/

	// Chroma bias (128) for every byte of a quadword. The conversion loop
	// subtracts it byte-wise (psubb) from the raw samples, which turns the
	// Cb/Cr bytes into signed values; the Y bytes are masked away afterwards.
	static const uint8 c_offs[8] = {
		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	};
		
	// Luma offset (16) expressed in the 9.7 fixed-point format the loop uses
	// for Y (Y is shifted left by 7 before this is subtracted), replicated
	// over the four 16-bit lanes of a quadword.
	static const int16 y_offs[4] = {
		16 << 7, 16 << 7, 16 << 7, 16 << 7
	};

	// Byte selectors for the four 16-bit lanes of a quadword. The conversion
	// loop reads row 0 at byte offset 0 and row 1 at byte offset 8 of this
	// table, so the row order must not change.
	static const uint16 masks[2][4] = {
		{ 0xff00, 0xff00, 0xff00, 0xff00 },	// [0] keep high byte (Cb/Cr)
		{ 0x00ff, 0x00ff, 0x00ff, 0x00ff }	// [1] keep low byte (Y)
	};
	
	// Fixed-point conversion factors, one quadword (four int16 lanes) per row.
	// The conversion loop addresses the rows by byte offset (0, 8 and 16), so
	// the row order must not change. Only three rows exist and are used; the
	// array bound now says so instead of reserving two unused zero rows.
	static const int16 scale[3][4] = {
		// [0] Y scale: applied with pmulhw to (Y - 16) << 7, so the factor
		//     is stored times 512 (128 * 512 == 1 << 16)
		{ (int16)(1.1678 * 512), (int16)(1.1678 * 512), (int16)(1.1678 * 512), (int16)(1.1678 * 512) },
		// [1] CbG CrG CbG CrG: green contribution, applied with pmaddwd to
		//     chroma held in the high byte of each lane
		{ (int16)(-0.3929 * 256), (int16)(-0.8154 * 256), (int16)(-0.3929 * 256), (int16)(-0.8154 * 256) },
		// [2] CbB CrR CbB CrR: blue/red contribution, applied with pmulhw
		{ (int16)(2.0232 * 256), (int16)(1.6007 * 256), (int16)(2.0232 * 256), (int16)(1.6007 * 256) },
	};

	static const int8 masks_8bit[2][8] = {	
		// r/b 16 bit mask and r/g/b 15 bit mask
		{ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8 },
		// g 16 bit mask
		{ 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc },
	};

	asm volatile(
	"2:\n"
		"pxor		%%mm7,%%mm7\n"
	
	"1:\n"
		"movq		(%0),%%mm0\n"		// mm0 = Cr2'Y3' Cb2'Y2' Cr0'Y1' Cb0'Y0'
		"movq		%%mm0,%%mm1\n"		// mm1 = Cr2'Y3' Cb2'Y2' Cr0'Y1' Cb0'Y0'
		
		// Y is in 16..235
		// we need to substract 16 and scale to full range
		// as standard MMX has a _signed_ integer multiply only, the highest bit must 
		// be zero before scaling, i.e. we use signed format 9.7
		"pand		8+%4,%%mm0\n"		// mm2 =     Y3'     Y2'     Y1'     Y0'
		"psllw		$7,%%mm0\n"
		"psubusw	%3,%%mm0\n"
		"pmulhw		%5,%%mm0\n"			// mm0 =      Y3      Y2      Y1      Y0

		// Cb and Cr is biased; compensate that		
		"psubb		%2,%%mm1\n"			// mm1 = Cr2 xxx Cb2 xxx Cr0 xxx Cb0 xxx
		"pand		%4,%%mm1\n"			// mm1 = Cr2     Cb2     Cr0     Cb0
		
		// transform Cb and Cr to green component
		"movq		%%mm1,%%mm2\n"
		"pmaddwd	8+%5,%%mm1\n"		// mm1 =  CbCrG2 xxxxxxx  CbCrG0 xxxxxxx
		"psrad		$16,%%mm1\n"		// mm1 =          CbCrG2          CbCrG0
		"packssdw	%%mm1,%%mm1\n"		// mm1 =  CbCrG2  CbCrG0  CbCrG2  CbCrG0
		"punpcklwd	%%mm1,%%mm1\n"		// mm1 =  CbCrG2  CbCrG2  CbCrG0  CbCrG0
		
		// transform Cb to blue and Cr to red component
		"pmulhw		16+%5,%%mm2\n"		// mm2 =    CrR2    CbB2    CrR0    CbB0
		
		// nasty shuffling to separate and duplicate components
		"movq		%%mm2,%%mm3\n"
		"punpcklwd	%%mm3,%%mm3\n"		// mm3 =    CrR0    CrR0    CbB0    CbB0
		"punpckhwd	%%mm2,%%mm2\n"		// mm2 =    CrR2    CrR2    CbB2    CbB2
		
		"movq		%%mm3,%%mm4\n"
		"punpckldq	%%mm2,%%mm3\n"		// mm3 =    CbB2    CbB2    CbB0    CbB0
		"punpckhdq	%%mm2,%%mm4\n"		// mm4 =    CrR2    CrR2    CrR0    CrR0
		
		// add Y to get final RGB			
		"paddsw		%%mm0,%%mm1\n"		// mm1 =      G3      G2      G1      G0
		"paddsw		%%mm0,%%mm3\n"		// mm3 =      B3      B2      B1      B0
		"paddsw		%%mm0,%%mm4\n"		// mm4 =      R3      R2      R1      R0

		// now, RBG can be converted to 8 bits each
		"packuswb	%%mm0,%%mm1\n"		// mm1 =  Y3  Y2  Y1  Y0  G3  G2  G1  G0
		"packuswb	%%mm4,%%mm3\n"		// mm3 =  R3  R2  R1  R0  B3  B2  B1  B0

#ifdef RGB32
		// convertion to RGB32
		"movq		%%mm3,%%mm2\n"
		"punpckhbw	%%mm1,%%mm3\n"		// mm3 =  Y3  R3  Y2  R2  Y1  R1  Y0  R0
		"punpcklbw	%%mm1,%%mm2\n"		// mm2 =  G3  B3  G2  B2  G1  B1  G0  B0
		
		"movq		%%mm2,%%mm1\n"
		"punpcklwd	%%mm3,%%mm2\n"
		"movq		%%mm2,0x00(%1)\n"	// dst =  Y1  R1  G1  B1  Y0  R0  G0  B0

		"punpckhwd	%%mm3,%%mm1\n"
		"movq		%%mm1,0x08(%1)\n"	// dst =  Y3  R3  G3  B3  Y2  R2  G2  B2
		
		"addl		$0x08,%0\n"			// source += 8
		"addl		$0x10,%1\n"			// destination += 16
		"subl		$0x10,%7\n"			// next pixels
#endif

#ifdef RGB16
		// convertion to RGB16
		// things would be much easier if Intel had added a RGB32->RGB16 instruction
		"pand		%6,%%mm3\n"			//  mm3 -  R3  R2  R1  R0  B3  B2  B1  B0 (masked)
		"pand		8+%6,%%mm1\n"		//  mm1 -  Y3  Y2  Y1  Y0  G3  G2  G1  G0 (masked)
		
		"punpcklbw	%%mm7,%%mm1\n"		//  mm1 -      G3      G2      G1      G0
		"movq		%%mm7,%%mm2\n"
		"punpckhbw 	%%mm3,%%mm2\n"		//  mm2 -  R3      R2      R1      R0
		"punpcklbw 	%%mm7,%%mm3\n"		//  mm3 -      B3      B2      B1      B0
		
		"psllw		$3,%%mm1\n"			//  mm1 -    G3      G2      G1      G0
		"psrlw		$3,%%mm3\n"			//  mm3 -      B3      B2      B1      B0
		
		"por		%%mm2,%%mm1\n"
		"por		%%mm3,%%mm1\n"
		"movq		%%mm1,(%1)\n"

		"addl		$0x08,%0\n"			// source += 8
		"addl		$0x08,%1\n"			// destination += 8
		"subl		$0x08,%7\n"			// next pixels
#endif

#ifdef RGB15
		// convertion to RGB15
		// same problem as before
		"pand		%6,%%mm3\n"			//  mm3 -  R3  R2  R1  R0  B3  B2  B1  B0 (masked)
		"pand		%6,%%mm1\n"			//  mm1 -  Y3  Y2  Y1  Y0  G3  G2  G1  G0 (masked)
		
		"punpcklbw	%%mm7,%%mm1\n"		//  mm1 -      G3      G2      G1      G0
		"movq		%%mm7,%%mm2\n"
		"punpckhbw 	%%mm3,%%mm2\n"		//  mm2 -  R3      R2      R1      R0
		"punpcklbw 	%%mm7,%%mm3\n"		//  mm3 -      B3      B2      B1      B0
		
		"psllw		$2,%%mm1\n"			//  mm1 -    G3      G2      G1      G0
		"psrlw		$1,%%mm2\n"			//  mm2 -  R3      R2      R1      R0
		"psrlw		$3,%%mm3\n"			//  mm3 -      B3      B2      B1      B0
		
		"por		%%mm2,%%mm1\n"
		"por		%%mm3,%%mm1\n"
		"movq		%%mm1,(%1)\n"

		"addl		$0x08,%0\n"			// source += 8
		"addl		$0x08,%1\n"			// destination += 8
		"subl		$0x08,%7\n"			// next pixels
#endif

		// next
		"jg			1b\n"
		
		"movl		%9,%7\n"
		"subl		%7,%8\n"
		
		"jg			2b\n"
		"emms\n"
		:
		: "a" (convert_buffer), "d" (bits), 
		  "g" (c_offs), "g" (y_offs), "g" (masks), "g" (scale), "g" (masks_8bit),
		  "c" (bytesPerRow), "S" (bitsLength), "D" (bytesPerRow));