/* NV Acceleration functions */

/* Author: Rudolf Cornelissen 8/2003-6/2008.

   This code was possible thanks to:
    - the Linux XFree86 NV driver,
    - the Linux UtahGLX 3D driver.
*/

#define MODULE_BIT 0x00080000

#include "nv_std.h"

/* acceleration notes */

/* functions Be's app_server uses:
   fill span (horizontal only)
   fill rectangle (these 2 are very similar)
   invert rectangle
   blit
*/

static void nv_start_dma(void);
static status_t nv_acc_fifofree_dma(uint16 cmd_size);
static void nv_acc_cmd_dma(uint32 cmd, uint16 offset, uint16 size);
static void nv_acc_set_ch_dma(uint16 ch, uint32 handle);

/* used to track engine DMA stalls */
static uint8 err;

/* wait until engine completely idle */
status_t nv_acc_wait_idle_dma()
{
    /* we'd better check for timeouts on the DMA engine as it's theoretically
     * breakable by malfunctioning software */
    uint16 cnt = 0;

    /* wait until all upcoming commands are in execution at least. Do this until
     * we hit a timeout; abort if we failed at least three times before:
     * if DMA stalls, we have to forget about it altogether at some point, or
     * the system will almost come to a complete halt.. */
    /* note:
     * it doesn't matter which FIFO channel's DMA registers we access, they are in
     * fact all the same set. It also doesn't matter if the channel was assigned a
     * command or not. */
    while ((NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET) != (si->engine.dma.put << 2))
        && (cnt < 10000) && (err < 3))
    {
        /* snooze a bit so I do not hammer the bus */
        snooze(100);
        cnt++;
    }

    /* log timeout if we had one */
    if (cnt == 10000)
    {
        if (err < 3) err++;
        LOG(4,("ACC_DMA: wait_idle; DMA timeout #%d, engine trouble!\n", err));
    }

    /* wait until execution completed */
    while (ACCR(STATUS))
    {
        /* snooze a bit so I do not hammer the bus */
        snooze(100);
    }

    return B_OK;
}
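/* note on the wait loop above:
 * si->engine.dma.put counts in 32-bit words, while the hardware's DMA 'get'
 * register holds a byte offset into the command buffer; hence the '<< 2' when
 * comparing the two. An illustrative sketch (hypothetical numbers, assuming a
 * put pointer of 0x20 words):
 *
 *   uint32 put_bytes = si->engine.dma.put << 2;             // 0x20 words -> 0x80 bytes
 *   uint32 get_bytes = NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET);
 *   bool caught_up = (get_bytes == put_bytes);              // engine fetched everything we issued
 */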
/* AFAIK this must be done for every new screenmode.
 * Engine required init. */
status_t nv_acc_init_dma()
{
    uint32 cnt, tmp;
    uint32 surf_depth, cmd_depth;

    /* reset the engine DMA stalls counter */
    err = 0;

    /* a hanging engine only recovers from a complete power-down/power-up cycle */
    NV_REG32(NV32_PWRUPCTRL) = 0x13110011;
    snooze(1000);
    NV_REG32(NV32_PWRUPCTRL) = 0x13111111;

    /* setup PTIMER: */
    //fixme? how about NV28 setup as just after coldstarting? (see nv_info.c)
    /* set timer numerator to 8 (in b0-15) */
    ACCW(PT_NUMERATOR, 0x00000008);
    /* set timer denominator to 3 (in b0-15) */
    ACCW(PT_DENOMINATR, 0x00000003);

    /* disable timer-alarm INT requests (b0) */
    ACCW(PT_INTEN, 0x00000000);
    /* reset timer-alarm INT status bit (b0) */
    ACCW(PT_INTSTAT, 0xffffffff);

    /* setup acc engine 'source' tile address ranges */
    if ((si->ps.card_type == NV40) || (si->ps.card_type == NV45))
    {
        ACCW(NV10_FBTIL0AD, 0);
        ACCW(NV10_FBTIL1AD, 0);
        ACCW(NV10_FBTIL2AD, 0);
        ACCW(NV10_FBTIL3AD, 0);
        ACCW(NV10_FBTIL4AD, 0);
        ACCW(NV10_FBTIL5AD, 0);
        ACCW(NV10_FBTIL6AD, 0);
        ACCW(NV10_FBTIL7AD, 0);
        ACCW(NV10_FBTIL0ED, (si->ps.memory_size - 1));
        ACCW(NV10_FBTIL1ED, (si->ps.memory_size - 1));
        ACCW(NV10_FBTIL2ED, (si->ps.memory_size - 1));
        ACCW(NV10_FBTIL3ED, (si->ps.memory_size - 1));
        ACCW(NV10_FBTIL4ED, (si->ps.memory_size - 1));
        ACCW(NV10_FBTIL5ED, (si->ps.memory_size - 1));
        ACCW(NV10_FBTIL6ED, (si->ps.memory_size - 1));
        ACCW(NV10_FBTIL7ED, (si->ps.memory_size - 1));
    }
    else
    {
        /* NV41, 43, 44, G70 and up */
        ACCW(NV41_FBTIL0AD, 0);
        ACCW(NV41_FBTIL1AD, 0);
        ACCW(NV41_FBTIL2AD, 0);
        ACCW(NV41_FBTIL3AD, 0);
        ACCW(NV41_FBTIL4AD, 0);
        ACCW(NV41_FBTIL5AD, 0);
        ACCW(NV41_FBTIL6AD, 0);
        ACCW(NV41_FBTIL7AD, 0);
        ACCW(NV41_FBTIL8AD, 0);
        ACCW(NV41_FBTIL9AD, 0);
        ACCW(NV41_FBTILAAD, 0);
        ACCW(NV41_FBTILBAD, 0);
        ACCW(NV41_FBTIL0ED, (si->ps.memory_size - 1));
        ACCW(NV41_FBTIL1ED, (si->ps.memory_size - 1));
        ACCW(NV41_FBTIL2ED, (si->ps.memory_size - 1));
        ACCW(NV41_FBTIL3ED, (si->ps.memory_size - 1));
        ACCW(NV41_FBTIL4ED, (si->ps.memory_size - 1));
        ACCW(NV41_FBTIL5ED, (si->ps.memory_size - 1));
        ACCW(NV41_FBTIL6ED, (si->ps.memory_size - 1));
        ACCW(NV41_FBTIL7ED, (si->ps.memory_size - 1));
        ACCW(NV41_FBTIL8ED, (si->ps.memory_size - 1));
        ACCW(NV41_FBTIL9ED, (si->ps.memory_size - 1));
        ACCW(NV41_FBTILAED, (si->ps.memory_size - 1));
        ACCW(NV41_FBTILBED, (si->ps.memory_size - 1));

        if (si->ps.card_type >= G70)
        {
            ACCW(G70_FBTILCAD, 0);
            ACCW(G70_FBTILDAD, 0);
            ACCW(G70_FBTILEAD, 0);
            ACCW(G70_FBTILCED, (si->ps.memory_size - 1));
            ACCW(G70_FBTILDED, (si->ps.memory_size - 1));
            ACCW(G70_FBTILEED, (si->ps.memory_size - 1));
        }
    }

    /*** PRAMIN ***/
    /* first clear the entire RAMHT (hash-table) space to a defined state. It turns
     * out at least NV11 will keep the previously programmed handles over resets and
     * power-outages up to about 15 seconds!! Faulty entries might well hang the
     * engine (confirmed on NV11).
     * Note:
     * this behaviour is not very strange: even very old DRAM chips are known to be
     * able to do this, even though you should refresh them every few milliseconds or
     * so. (Large memory cell capacitors, though different cells vary a lot in their
     * capacity.)
     * Of course data validity is not certain by a long shot over this large
     * amount of time.. */
    for (cnt = 0; cnt < 0x0400; cnt++)
        NV_REG32(NVACC_HT_HANDL_00 + (cnt << 2)) = 0;

    /* RAMHT (hash-table) space SETUP FIFO HANDLES */
    /* note:
     * 'instance' tells you where the engine command is stored in 'PR_CTXx_x' sets
     * below: instance being b4-19 with base address NV_PRAMIN_CTX_0 (0x00700000).
     * That command is linked to the handle noted here. This handle is then used to
     * tell the FIFO to which engine command it is connected!
     * (CTX registers are actually a sort of RAM space.) */
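    /* illustrative decode of one of the RAMHT value words programmed below, as
     * described in the note above (the field split of the upper bits is an
     * assumption inferred from the values used here, not taken from docs):
     *
     *   HT_VALUE_01 = 0x00101148
     *     b0-15  = $1148 : instance, i.e. offset into the PR_CTXx_x 'RAM' space;
     *     b16-31 = $0010 : engine select (acc engine) and CHID (channel $00).
     */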
    /* (first set) */
    ACCW(HT_HANDL_00, (0x80000000 | NV10_CONTEXT_SURFACES_2D)); /* 32bit handle (not used) */
    ACCW(HT_VALUE_00, 0x0010114c); /* instance $114c, engine = acc engine, CHID = $00 */
    ACCW(HT_HANDL_01, (0x80000000 | NV_IMAGE_BLIT)); /* 32bit handle */
    ACCW(HT_VALUE_01, 0x00101148); /* instance $1148, engine = acc engine, CHID = $00 */
    ACCW(HT_HANDL_02, (0x80000000 | NV4_GDI_RECTANGLE_TEXT)); /* 32bit handle */
    ACCW(HT_VALUE_02, 0x0010114a); /* instance $114a, engine = acc engine, CHID = $00 */
    /* (second set) */
    ACCW(HT_HANDL_10, (0x80000000 | NV_ROP5_SOLID)); /* 32bit handle */
    ACCW(HT_VALUE_10, 0x00101142); /* instance $1142, engine = acc engine, CHID = $00 */
    ACCW(HT_HANDL_11, (0x80000000 | NV_IMAGE_BLACK_RECTANGLE)); /* 32bit handle */
    ACCW(HT_VALUE_11, 0x00101144); /* instance $1144, engine = acc engine, CHID = $00 */
    ACCW(HT_HANDL_12, (0x80000000 | NV_IMAGE_PATTERN)); /* 32bit handle */
    ACCW(HT_VALUE_12, 0x00101146); /* instance $1146, engine = acc engine, CHID = $00 */
    ACCW(HT_HANDL_13, (0x80000000 | NV_SCALED_IMAGE_FROM_MEMORY)); /* 32bit handle */
    ACCW(HT_VALUE_13, 0x0010114e); /* instance $114e, engine = acc engine, CHID = $00 */

    /* program CTX registers: CTX1 is mostly done later (colorspace dependent) */
    /* note:
     * CTX determines which HT handles point to what engine commands. */
    /* note also:
     * CTX registers are in fact in the same GPU internal RAM space as the engine's
     * hashtable. This means that stuff programmed in here also survives resets and
     * power-outages! (confirmed NV11) */

    /* setup a DMA define for use by command defines below. */
    ACCW(PR_CTX0_R, 0x00003000); /* DMA page table present and of linear type;
                                  * DMA target node is NVM (non-volatile memory?)
                                  * (instead of doing PCI or AGP transfers) */
    ACCW(PR_CTX1_R, (si->ps.memory_size - 1)); /* DMA limit: size is all cardRAM */
    ACCW(PR_CTX2_R, ((0x00000000 & 0xfffff000) | 0x00000002));
                                 /* DMA access type is READ_AND_WRITE;
                                  * memory starts at start of cardRAM (b12-31):
                                  * its address needs to be at a 4kb boundary! */
    ACCW(PR_CTX3_R, 0x00000002); /* unknown (looks like this is rubbish/not needed?) */
    /* setup set '0' for cmd NV_ROP5_SOLID */
    ACCW(PR_CTX0_0, 0x02080043); /* NVclass $043, patchcfg ROP_AND, nv10+: little endian */
    ACCW(PR_CTX1_0, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
    ACCW(PR_CTX2_0, 0x00000000); /* DMA0 and DMA1 instance invalid */
    ACCW(PR_CTX3_0, 0x00000000); /* method traps disabled */
    ACCW(PR_CTX0_1, 0x00000000); /* extra */
    ACCW(PR_CTX1_1, 0x00000000); /* extra */
    /* setup set '1' for cmd NV_IMAGE_BLACK_RECTANGLE */
    ACCW(PR_CTX0_2, 0x02080019); /* NVclass $019, patchcfg ROP_AND, nv10+: little endian */
    ACCW(PR_CTX1_2, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
    ACCW(PR_CTX2_2, 0x00000000); /* DMA0 and DMA1 instance invalid */
    ACCW(PR_CTX3_2, 0x00000000); /* method traps disabled */
    ACCW(PR_CTX0_3, 0x00000000); /* extra */
    ACCW(PR_CTX1_3, 0x00000000); /* extra */
    /* setup set '2' for cmd NV_IMAGE_PATTERN */
    ACCW(PR_CTX0_4, 0x02080018); /* NVclass $018, patchcfg ROP_AND, nv10+: little endian */
    ACCW(PR_CTX1_4, 0x02000000); /* colorspace not set, notify instance is $0200 (b16-31) */
    ACCW(PR_CTX2_4, 0x00000000); /* DMA0 and DMA1 instance invalid */
    ACCW(PR_CTX3_4, 0x00000000); /* method traps disabled */
    ACCW(PR_CTX0_5, 0x00000000); /* extra */
    ACCW(PR_CTX1_5, 0x00000000); /* extra */
    /* setup set '4' for cmd NV12_IMAGE_BLIT */
    ACCW(PR_CTX0_6, 0x0208009f); /* NVclass $09f, patchcfg ROP_AND, nv10+: little endian */
    ACCW(PR_CTX1_6, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
    ACCW(PR_CTX2_6, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
    ACCW(PR_CTX3_6, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
    ACCW(PR_CTX0_7, 0x00000000); /* extra */
    ACCW(PR_CTX1_7, 0x00000000); /* extra */
    /* setup set '5' for cmd NV4_GDI_RECTANGLE_TEXT */
    ACCW(PR_CTX0_8, 0x0208004a); /* NVclass $04a, patchcfg ROP_AND, nv10+: little endian */
    ACCW(PR_CTX1_8, 0x02000000); /* colorspace not set, notify instance is $0200 (b16-31) */
    ACCW(PR_CTX2_8, 0x00000000); /* DMA0 and DMA1 instance invalid */
    ACCW(PR_CTX3_8, 0x00000000); /* method traps disabled */
    ACCW(PR_CTX0_9, 0x00000000); /* extra */
    ACCW(PR_CTX1_9, 0x00000000); /* extra */
    /* setup set '6' for cmd NV10_CONTEXT_SURFACES_2D */
    ACCW(PR_CTX0_A, 0x02080062); /* NVclass $062, nv10+: little endian */
    ACCW(PR_CTX1_A, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
    ACCW(PR_CTX2_A, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
    ACCW(PR_CTX3_A, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
    ACCW(PR_CTX0_B, 0x00000000); /* extra */
    ACCW(PR_CTX1_B, 0x00000000); /* extra */
    /* setup set '7' for cmd NV_SCALED_IMAGE_FROM_MEMORY */
    ACCW(PR_CTX0_C, 0x02080077); /* NVclass $077, nv10+: little endian */
    ACCW(PR_CTX1_C, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
    ACCW(PR_CTX2_C, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
    ACCW(PR_CTX3_C, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
    ACCW(PR_CTX0_D, 0x00000000); /* extra */
    ACCW(PR_CTX1_D, 0x00000000); /* extra */
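    /* illustrative decode of one of the PR_CTX0_x words above, based on the
     * per-line comments in this file (exact bitfield boundaries are an
     * assumption, not confirmed by documentation):
     *
     *   PR_CTX0_6 = 0x0208009f
     *     b0-11 = $09f : NVclass (the NV12 image blit object class);
     *     the remaining set bits ($02080000) select the ROP_AND patch
     *     configuration and, on nv10+, little endian operation.
     */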
    /* setup DMA set pointed at by PF_CACH1_DMAI */
    ACCW(PR_CTX0_E, 0x00003002); /* DMA page table present and of linear type;
                                  * DMA class is $002 (b0-11);
                                  * DMA target node is NVM (non-volatile memory?)
                                  * (instead of doing PCI or AGP transfers) */
    ACCW(PR_CTX1_E, 0x00007fff); /* DMA limit: tablesize is 32k bytes */
    ACCW(PR_CTX2_E, (((si->ps.memory_size - 1) & 0xffff8000) | 0x00000002));
                                 /* DMA access type is READ_AND_WRITE;
                                  * table is located at end of cardRAM (b12-31):
                                  * its address needs to be at a 4kb boundary! */

    /* do an explicit engine reset */
    ACCW(DEBUG0, 0xffffffff);
    ACCW(DEBUG0, 0x00000000);

    /* disable all acceleration engine INT requests */
    ACCW(ACC_INTE, 0x00000000);
    /* reset all acceleration engine INT status bits */
    ACCW(ACC_INTS, 0xffffffff);
    /* context control enabled */
    ACCW(NV10_CTX_CTRL, 0x10010100);
    /* all acceleration buffers, pitches and colors are valid */
    ACCW(NV10_ACC_STAT, 0xffffffff);
    /* enable acceleration engine command FIFO */
    ACCW(FIFO_EN, 0x00000001);

    /* setup surface type:
     * b1-0 = %01 = surface type is non-swizzle;
     * this is needed to enable 3D on NV1x (confirmed) and maybe others? */
    ACCW(NV10_SURF_TYP, ((ACCR(NV10_SURF_TYP)) & 0x0007ff00));
    ACCW(NV10_SURF_TYP, ((ACCR(NV10_SURF_TYP)) | 0x00020101));

    /* init some function blocks */
    ACCW(DEBUG1, 0x401287c0);
    ACCW(DEBUG3, 0x60de8051);
    /* disable specific functions, but enable SETUP_SPARE2 register */
    ACCW(NV10_DEBUG4, 0x00008000);
    /* set limit_viol_pix_adress(?): more likely something unknown.. */
    ACCW(NV25_WHAT0, 0x00be3c5f);

    /* setup some unknown serially accessed registers (?) */
    /* (the loop determines the position of the lowest set bit in the strap byte) */
    tmp = (NV_REG32(NV32_NV4X_WHAT0) & 0x000000ff);
    for (cnt = 0; (tmp && !(tmp & 0x00000001)); tmp >>= 1, cnt++);
    {
        ACCW(NV4X_WHAT2, cnt);
    }

    /* unknown.. */
    switch (si->ps.card_type)
    {
    case NV40:
    case NV45: /* and NV48: but these are pgm'd as NV45 currently */
        ACCW(NV40_WHAT0, 0x83280fff);
        ACCW(NV40_WHAT1, 0x000000a0);
        ACCW(NV40_WHAT2, 0x0078e366);
        ACCW(NV40_WHAT3, 0x0000014c);
        break;
    case NV41: /* and ID == 0x012x: but no cards defined yet */
        ACCW(NV40P_WHAT0, 0x83280eff);
        ACCW(NV40P_WHAT1, 0x000000a0);
        ACCW(NV40P_WHAT2, 0x007596ff);
        ACCW(NV40P_WHAT3, 0x00000108);
        break;
    case NV43:
        ACCW(NV40P_WHAT0, 0x83280eff);
        ACCW(NV40P_WHAT1, 0x000000a0);
        ACCW(NV40P_WHAT2, 0x0072cb77);
        ACCW(NV40P_WHAT3, 0x00000108);
        break;
    case NV44:
    case G72:
        ACCW(NV40P_WHAT0, 0x83280eff);
        ACCW(NV40P_WHAT1, 0x000000a0);
        NV_REG32(NV32_NV44_WHAT10) = NV_REG32(NV32_NV10STRAPINFO);
        NV_REG32(NV32_NV44_WHAT11) = 0x00000000;
        NV_REG32(NV32_NV44_WHAT12) = 0x00000000;
        NV_REG32(NV32_NV44_WHAT13) = NV_REG32(NV32_NV10STRAPINFO);
        ACCW(NV44_WHAT2, 0x00000000);
        ACCW(NV44_WHAT3, 0x00000000);
        break;
    /* case NV44 type 2: (cardID 0x022x)
        //fixme if needed: doesn't seem to need the strapinfo thing..
        ACCW(NV40P_WHAT0, 0x83280eff);
        ACCW(NV40P_WHAT1, 0x000000a0);
        ACCW(NV44_WHAT2, 0x00000000);
        ACCW(NV44_WHAT3, 0x00000000);
        break; */
    case G70:
    case G71:
    case G73:
        ACCW(NV40P_WHAT0, 0x83280eff);
        ACCW(NV40P_WHAT1, 0x000000a0);
        ACCW(NV40P_WHAT2, 0x07830610);
        ACCW(NV40P_WHAT3, 0x0000016a);
        break;
    default:
        ACCW(NV40P_WHAT0, 0x83280eff);
        ACCW(NV40P_WHAT1, 0x000000a0);
        break;
    }

    ACCW(NV10_TIL3PT, 0x2ffff800);
    ACCW(NV10_TIL3ST, 0x00006000);
    ACCW(NV4X_WHAT1, 0x01000000);
    /* engine data source DMA instance = $1140 */
    ACCW(NV4X_DMA_SRC, 0x00001140);

    /* copy tile setup stuff from previous setup 'source' to acc engine
     * (pattern colorRAM?) */
    if ((si->ps.card_type == NV40) || (si->ps.card_type == NV45))
    {
        for (cnt = 0; cnt < 32; cnt++)
        {
            /* copy NV10_FBTIL0AD up to/including NV10_FBTIL7ST */
            NV_REG32(NVACC_NV20_WHAT0 + (cnt << 2)) =
                NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
            /* copy NV10_FBTIL0AD up to/including NV10_FBTIL7ST */
            NV_REG32(NVACC_NV20_2_WHAT0 + (cnt << 2)) =
                NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
        }
    }
    else
    {
        /* NV41, 43, 44, G70 and later */
        if (si->ps.card_type >= G70)
        {
            for (cnt = 0; cnt < 60; cnt++)
            {
                /* copy NV41_FBTIL0AD up to/including G70_FBTILEST */
                NV_REG32(NVACC_NV41_WHAT0 + (cnt << 2)) =
                    NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
                /* copy NV41_FBTIL0AD up to/including G70_FBTILEST */
                NV_REG32(NVACC_NV20_2_WHAT0 + (cnt << 2)) =
                    NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
            }
        }
        else
        {
            /* NV41, 43, 44 */
            for (cnt = 0; cnt < 48; cnt++)
            {
                /* copy NV41_FBTIL0AD up to/including NV41_FBTILBST */
                NV_REG32(NVACC_NV20_WHAT0 + (cnt << 2)) =
                    NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
                if (si->ps.card_type != NV44)
                {
                    /* copy NV41_FBTIL0AD up to/including NV41_FBTILBST */
                    NV_REG32(NVACC_NV20_2_WHAT0 + (cnt << 2)) =
                        NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
                }
            }
        }
    }

    if ((si->ps.card_type == NV40) || (si->ps.card_type == NV45))
    {
        /* copy some RAM configuration info(?) */
        ACCW(NV20_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
        ACCW(NV20_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
        ACCW(NV40_WHAT_T2, NV_REG32(NV32_PFB_CONFIG_0));
        ACCW(NV40_WHAT_T3, NV_REG32(NV32_PFB_CONFIG_1));

        /* setup location of active screen in framebuffer */
        ACCW(NV20_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
        ACCW(NV20_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
        /* setup accessible card memory range */
        ACCW(NV20_BLIMIT6, (si->ps.memory_size - 1));
        ACCW(NV20_BLIMIT7, (si->ps.memory_size - 1));
    }
    else
    {
        /* NV41, 43, 44, G70 and later */
        /* copy some RAM configuration info(?) */
        if (si->ps.card_type >= G70)
        {
            ACCW(G70_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
            ACCW(G70_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
        }
        else
        {
            /* NV41, 43, 44 */
            ACCW(NV40P_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
            ACCW(NV40P_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
        }
        ACCW(NV40P_WHAT_T2, NV_REG32(NV32_PFB_CONFIG_0));
        ACCW(NV40P_WHAT_T3, NV_REG32(NV32_PFB_CONFIG_1));

        /* setup location of active screen in framebuffer */
        ACCW(NV40P_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
        ACCW(NV40P_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
        /* setup accessible card memory range */
        ACCW(NV40P_BLIMIT6, (si->ps.memory_size - 1));
        ACCW(NV40P_BLIMIT7, (si->ps.memory_size - 1));
    }

    /* setup some acc engine tile stuff */
    ACCW(NV10_TIL2AD, 0x00000000);
    ACCW(NV10_TIL0ED, 0xffffffff);

    /* all cards: */
    /* setup clipping: rect size is 32768 x 32768, probably max. setting */
    /* note:
     * can also be done via the NV_IMAGE_BLACK_RECTANGLE engine command. */
    ACCW(ABS_UCLP_XMIN, 0x00000000);
    ACCW(ABS_UCLP_YMIN, 0x00000000);
    ACCW(ABS_UCLP_XMAX, 0x00007fff);
    ACCW(ABS_UCLP_YMAX, 0x00007fff);

    /* setup sync parameters for NV12_IMAGE_BLIT command for the current mode:
     * values given are CRTC vertical counter limit values. The NV12 command will
     * wait for the specified CRTC's vertical counter to be in between the given
     * values */
    ACCW(NV11_CRTC_LO, si->dm.timing.v_display - 1);
    ACCW(NV11_CRTC_HI, si->dm.timing.v_display + 1);
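    /* overview sketch of the command path set up below (summarizing the notes
     * in this file; a simplification, not an authoritative description):
     *
     *   driver writes 32-bit words  ->  DMA command buffer (last 32kb of cardRAM,
     *                                   described by the DMA define at CTX $1150)
     *   PFIFO cache1 (DMA mode)     ->  fetches those words via the 'put'/'get'
     *                                   pointers programmed further down
     *   RAMHT handle lookup         ->  routes each FIFO channel to the engine
     *                                   object (CTX set) it was assigned to
     */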
    /*** PFIFO ***/
    /* (setup caches) */
    /* disable caches reassign */
    ACCW(PF_CACHES, 0x00000000);
    /* PFIFO mode: channel 0 is in DMA mode, channels 1 - 31 are in PIO mode */
    ACCW(PF_MODE, 0x00000001);
    /* cache1 push0 access disabled */
    ACCW(PF_CACH1_PSH0, 0x00000000);
    /* cache1 pull0 access disabled */
    ACCW(PF_CACH1_PUL0, 0x00000000);
    /* cache1 push1 mode = DMA */
    ACCW(PF_CACH1_PSH1, 0x00010000);
    /* cache1 DMA Put offset = 0 (b2-28) */
    ACCW(PF_CACH1_DMAP, 0x00000000);
    /* cache1 DMA Get offset = 0 (b2-28) */
    ACCW(PF_CACH1_DMAG, 0x00000000);
    /* cache1 DMA instance address = $1150 (b0-15);
     * instance being b4-19 with base address NV_PRAMIN_CTX_0 (0x00700000). */
    /* note:
     * should point to a DMA definition in CTX register space (which is sort of RAM).
     * This define tells the engine where the DMA cmd buffer is and what its size is.
     * Inside that cmd buffer you'll find the actual issued engine commands. */
    ACCW(PF_CACH1_DMAI, 0x00001150);
    /* cache0 push0 access disabled */
    ACCW(PF_CACH0_PSH0, 0x00000000);
    /* cache0 pull0 access disabled */
    ACCW(PF_CACH0_PUL0, 0x00000000);
    /* RAM HT (hash table) base address = $10000 (b4-8), size = 4k,
     * search = 128 (is byte offset between hash 'sets') */
    /* note:
     * so HT base is $00710000, last is $00710fff.
     * In this space you define the engine command handles (HT_HANDL_XX), which
     * in turn point to the defines in CTX register space (which is sort of RAM) */
    ACCW(PF_RAMHT, 0x03000100);
    /* RAM FC base address = $11000 (b3-8) (size is fixed to 0.5k(?)) */
    /* note:
     * so FC base is $00711000, last is $007111ff. (not used?) */
    ACCW(PF_RAMFC, 0x00000110);
    /* RAM RO base address = $11200 (b1-8), size = 0.5k */
    /* note:
     * so RO base is $00711200, last is $007113ff. (not used?) */
    /* note also:
     * This means(?) the PRAMIN CTX registers are accessible from base $00711400. */
    ACCW(PF_RAMRO, 0x00000112);
    /* PFIFO size: ch0-15 = 512 bytes, ch16-31 = 124 bytes */
    ACCW(PF_SIZE, 0x0000ffff);
    /* cache1 hash instance = $ffff (b0-15) */
    ACCW(PF_CACH1_HASH, 0x0000ffff);
    /* disable all PFIFO INTs */
    ACCW(PF_INTEN, 0x00000000);
    /* reset all PFIFO INT status bits */
    ACCW(PF_INTSTAT, 0xffffffff);
    /* cache0 pull0 engine = acceleration engine (graphics) */
    ACCW(PF_CACH0_PUL1, 0x00000001);
    /* cache1 DMA control: disable some stuff */
    ACCW(PF_CACH1_DMAC, 0x00000000);
    /* cache1 engine 0 up to/including 7 is software (could also be graphics or DVD) */
    ACCW(PF_CACH1_ENG, 0x00000000);
    /* cache1 DMA fetch: trigger at 128 bytes, size is 32 bytes, max requests is 15,
     * use little endian */
    ACCW(PF_CACH1_DMAF, 0x000f0078);
    /* cache1 DMA push: b0 = 1: access is enabled */
    ACCW(PF_CACH1_DMAS, 0x00000001);
    /* cache1 push0 access enabled */
    ACCW(PF_CACH1_PSH0, 0x00000001);
    /* cache1 pull0 access enabled */
    ACCW(PF_CACH1_PUL0, 0x00000001);
    /* cache1 pull1 engine = acceleration engine (graphics) */
    ACCW(PF_CACH1_PUL1, 0x00000001);
    /* enable PFIFO caches reassign */
    ACCW(PF_CACHES, 0x00000001);
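    /* illustrative summary of the PRAMIN layout that follows from the values
     * programmed above (derived from the notes in this file; offsets are not
     * taken from official documentation):
     *
     *   $00710000 - $00710fff : RAMHT (hash table holding the FIFO handles)
     *   $00711000 - $007111ff : RAMFC (apparently unused here)
     *   $00711200 - $007113ff : RAMRO (apparently unused here)
     *   $00711400 - ...       : CTX 'sets' (the PR_CTXx_x defines programmed above)
     */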
    /*** init acceleration engine command info ***/
    /* set object handles */
    /* note:
     * probably depending on some other setup, there are 8 or 32 FIFO channels
     * available. Assuming the current setup only has 8 channels because the 'rest'
     * isn't setup here... */
    si->engine.fifo.handle[0] = NV_ROP5_SOLID;
    si->engine.fifo.handle[1] = NV_IMAGE_BLACK_RECTANGLE;
    si->engine.fifo.handle[2] = NV_IMAGE_PATTERN;
    si->engine.fifo.handle[3] = NV4_SURFACE; /* NV10_CONTEXT_SURFACES_2D is identical */
    si->engine.fifo.handle[4] = NV_IMAGE_BLIT;
    si->engine.fifo.handle[5] = NV4_GDI_RECTANGLE_TEXT;
    /* not used currently */
    si->engine.fifo.handle[6] = 0;
    si->engine.fifo.handle[7] = 0;

    /* preset no FIFO channels assigned to cmd's */
    for (cnt = 0; cnt < 0x20; cnt++)
    {
        si->engine.fifo.ch_ptr[cnt] = 0;
    }

    /* set handle's pointers to their assigned FIFO channels */
    /* note:
     * b0-1 aren't used as address bits. Using b0 to indicate a valid pointer. */
    for (cnt = 0; cnt < 0x08; cnt++)
    {
        si->engine.fifo.ch_ptr[(si->engine.fifo.handle[cnt])] =
            (0x00000001 + (cnt * 0x00002000));
    }

    /*** init DMA command buffer info ***/
    si->dma_buffer = (void *)((char *)si->framebuffer +
        ((si->ps.memory_size - 1) & 0xffff8000));
    LOG(4,("ACC_DMA: command buffer is at address $%08x\n",
        ((uint32)(si->dma_buffer))));
    /* we have issued no DMA cmd's to the engine yet */
    si->engine.dma.put = 0;
    /* the current first free address in the DMA buffer is at offset 0 */
    si->engine.dma.current = 0;
    /* the DMA buffer can hold 8k 32-bit words (it's 32kb in size). */
    /* note:
     * one word is reserved at the end of the DMA buffer to be able to instruct the
     * engine to do a buffer wrap-around!
     * (DMA opcode 'noninc method': issue word $20000000.) */
    si->engine.dma.max = 8192 - 1;
    /* note the current free space we have left in the DMA buffer */
    si->engine.dma.free = si->engine.dma.max - si->engine.dma.current;

    /*** init FIFO via DMA command buffer. ***/
    /* wait for room in fifo for new FIFO assignment cmds if needed: */
    if (nv_acc_fifofree_dma(12) != B_OK) return B_ERROR;

    /* program new FIFO assignments */
    /* Raster OPeration: */
    nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH0, si->engine.fifo.handle[0]);
    /* Clip: */
    nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH1, si->engine.fifo.handle[1]);
    /* Pattern: */
    nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH2, si->engine.fifo.handle[2]);
    /* 2D Surfaces: */
    nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH3, si->engine.fifo.handle[3]);
    /* Blit: */
    nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH4, si->engine.fifo.handle[4]);
    /* Bitmap: */
    nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH5, si->engine.fifo.handle[5]);

    /*** Set pixel width ***/
    switch (si->dm.space)
    {
    case B_CMAP8:
        surf_depth = 0x00000001;
        cmd_depth = 0x00000003;
        break;
    case B_RGB15_LITTLE:
    case B_RGB16_LITTLE:
        surf_depth = 0x00000004;
        cmd_depth = 0x00000001;
        break;
    case B_RGB32_LITTLE:
    case B_RGBA32_LITTLE:
        surf_depth = 0x00000006;
        cmd_depth = 0x00000003;
        break;
    default:
        LOG(8,("ACC_DMA: init, invalid bit depth\n"));
        return B_ERROR;
    }

    /* wait for room in fifo for surface setup cmd if needed */
    if (nv_acc_fifofree_dma(5) != B_OK) return B_ERROR;
    /* now setup 2D surface (writing 5 32bit words) */
    nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 4);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = surf_depth; /* Format */
    /* setup screen pitch */
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
        ((si->fbc.bytes_per_row & 0x0000ffff) | (si->fbc.bytes_per_row << 16)); /* Pitch */
    /* setup screen location */
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
        ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer); /* OffsetSource */
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
        ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer); /* OffsetDest */
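    /* illustrative contents of the five words just placed in the DMA buffer
     * (numbers assume a hypothetical 640x480x32 desktop with a 2560-byte pitch;
     * the command word layout is the one documented in nv_acc_cmd_dma() below):
     *
     *   word 0: (4 << 18) | (FIFO offset of NV4_SURFACE_FORMAT)   method header
     *   word 1: 0x00000006                                        Format (32-bit)
     *   word 2: 0x0A000A00                                        Pitch: 2560 | (2560 << 16)
     *   word 3: 0x00000000                                        OffsetSource
     *   word 4: 0x00000000                                        OffsetDest
     *
     * (OffsetSource/Dest are 0 only if the visible screen starts at the beginning
     * of the framebuffer; otherwise they hold the byte offset computed above.)
     */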
    /* wait for room in fifo for pattern colordepth setup cmd if needed */
    if (nv_acc_fifofree_dma(2) != B_OK) return B_ERROR;
    /* set pattern colordepth (writing 2 32bit words) */
    nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETCOLORFORMAT, 1);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */

    /* wait for room in fifo for bitmap colordepth setup cmd if needed */
    if (nv_acc_fifofree_dma(2) != B_OK) return B_ERROR;
    /* set bitmap colordepth (writing 2 32bit words) */
    nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_SETCOLORFORMAT, 1);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */

    /* Load our pattern into the engine: */
    /* wait for room in fifo for pattern cmd if needed. */
    if (nv_acc_fifofree_dma(7) != B_OK) return B_ERROR;
    /* now setup pattern (writing 7 32bit words) */
    nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETSHAPE, 1);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
        0x00000000; /* SetShape: 0 = 8x8, 1 = 64x1, 2 = 1x64 */
    nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETCOLOR0, 4);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetColor0 */
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetColor1 */
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetPattern[0] */
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetPattern[1] */

    /* tell the engine to fetch and execute all (new) commands in the DMA buffer */
    nv_start_dma();

    return B_OK;
}

static void nv_start_dma(void)
{
    uint32 dummy;

    if (si->engine.dma.current != si->engine.dma.put)
    {
        si->engine.dma.put = si->engine.dma.current;
        /* dummy read the first address of the framebuffer to flush MTRR-WC buffers */
        dummy = *((volatile uint32 *)(si->framebuffer));

        /* actually start DMA to execute all commands now in buffer */
        /* note:
         * it doesn't matter which FIFO channel's DMA registers we access, they are in
         * fact all the same set. It also doesn't matter if the channel was assigned a
         * command or not. */
        /* note also:
         * NV_GENERAL_DMAPUT is a write-only register on some cards (confirmed NV11). */
        NV_REG32(NVACC_FIFO + NV_GENERAL_DMAPUT) = (si->engine.dma.put << 2);
    }
}
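/* illustrative picture of the DMA command buffer bookkeeping used throughout this
 * file (a sketch of the state the code above and below maintains, nothing extra):
 *
 *   |<----------------------- dma.max words (8191) ---------------------->| wrap word
 *   [ executed commands | commands waiting for the engine | free space    ][$20000000]
 *                       ^                                 ^
 *                hardware 'get'                    dma.put = last word handed to the
 *             (read back in the                    engine; dma.current = next word we
 *              fifofree routine)                   will write; dma.free = cached count
 *                                                  of words we may still write safely.
 */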
/* this routine does not check the engine's internal hardware FIFO, but the DMA
 * command buffer. You can see this as a FIFO as well, that feeds the hardware FIFO.
 * The hardware FIFO state is checked by the DMA hardware automatically. */
static status_t nv_acc_fifofree_dma(uint16 cmd_size)
{
    uint32 dmaget;
    /* we'd better check for timeouts on the DMA engine as it's theoretically
     * breakable by malfunctioning software */
    uint16 cnt = 0;

    /* check if the DMA buffer has enough room for the command.
     * note:
     * engine.dma.free is 'cached' */
    while ((si->engine.dma.free < cmd_size) && (cnt < 10000) && (err < 3))
    {
        /* see where the engine is currently fetching from the buffer */
        /* note:
         * read this only once in the code as accessing registers is relatively slow */
        /* note also:
         * it doesn't matter which FIFO channel's DMA registers we access, they are in
         * fact all the same set. It also doesn't matter if the channel was assigned a
         * command or not. */
        dmaget = ((NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET)) >> 2);

        /* update timeout counter: on NV11 on a Pentium4 2.8Ghz max reached count
         * using BeRoMeter 1.2.6 was about 600; so counting 10000 before generating
         * a timeout should definitely do it. Snooze()-ing cannot be done without a
         * serious speed penalty, even if done for only 1 microSecond. */
        cnt++;

        /* where's the engine fetching viewed from us issuing? */
        if (si->engine.dma.put >= dmaget)
        {
            /* engine is fetching 'behind us', the last piece of the buffer is free */
            /* note the 'updated' free space we have in the DMA buffer */
            si->engine.dma.free = si->engine.dma.max - si->engine.dma.current;
            /* if it's enough after all we exit this routine immediately. Else: */
            if (si->engine.dma.free < cmd_size)
            {
                /* not enough room left, so instruct DMA engine to reset the buffer
                 * when it's reaching the end of it */
                ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x20000000;
                /* reset our buffer pointer, so new commands will be placed at the
                 * beginning of the buffer. */
                si->engine.dma.current = 0;
                /* tell the engine to fetch the remaining command(s) in the DMA buffer
                 * that were not executed before. */
                nv_start_dma();

                /* NOW the engine is fetching 'in front of us', so the first piece
                 * of the buffer is free */
                /* note the updated current free space we have in the DMA buffer */
                si->engine.dma.free = dmaget - si->engine.dma.current;
                /* mind this pitfall:
                 * Leave some room between where the engine is fetching and where we
                 * put new commands. Otherwise the engine will crash on heavy loads.
                 * A crash can be forced best in 640x480x32 mode with BeRoMeter 1.2.6.
                 * (confirmed on NV11 and NV43 with less than 256 words forced freespace.)
                 * Note:
                 * The engine is DMA triggered for fetching chunks every 128 bytes,
                 * maybe this is the reason for this behaviour.
                 * Note also:
                 * it looks like the space that needs to be kept free is coupled
                 * with the size of the DMA buffer. */
                if (si->engine.dma.free < 256)
                    si->engine.dma.free = 0;
                else
                    si->engine.dma.free -= 256;
            }
        }
        else
        {
            /* engine is fetching 'in front of us', so the first piece of the buffer
             * is free */
            /* note the updated current free space we have in the DMA buffer */
            si->engine.dma.free = dmaget - si->engine.dma.current;
            /* mind this pitfall:
             * Leave some room between where the engine is fetching and where we
             * put new commands. Otherwise the engine will crash on heavy loads.
             * A crash can be forced best in 640x480x32 mode with BeRoMeter 1.2.6.
             * (confirmed on NV11 and NV43 with less than 256 words forced freespace.)
             * Note:
             * The engine is DMA triggered for fetching chunks every 128 bytes,
             * maybe this is the reason for this behaviour.
             * Note also:
             * it looks like the space that needs to be kept free is coupled
             * with the size of the DMA buffer. */
            if (si->engine.dma.free < 256)
                si->engine.dma.free = 0;
            else
                si->engine.dma.free -= 256;
        }
    }

    /* log timeout if we had one */
    if (cnt == 10000)
    {
        if (err < 3) err++;
        LOG(4,("ACC_DMA: fifofree; DMA timeout #%d, engine trouble!\n", err));
    }

    /* we must make the acceleration routines abort or the driver will hang! */
    if (err >= 3) return B_ERROR;

    return B_OK;
}
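/* illustrative encoding of a 'method' opcode word as built by nv_acc_cmd_dma()
 * below (kept symbolic, since the numeric method offsets live in the header files):
 *
 *   word = (size << 18) | ((ch_ptr[handle] + method_offset) & 0x0000fffc);
 *
 * e.g. a command with 3 argument words on the FIFO channel stored at
 * ch_ptr[NV_IMAGE_BLIT] would get (3 << 18) = 0x000c0000 in its upper bits, with
 * b2-15 holding the channel base plus the method offset. The engine then consumes
 * the next 'size' words in the buffer as that method's arguments. */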
static void nv_acc_cmd_dma(uint32 cmd, uint16 offset, uint16 size)
{
    /* NV_FIFO_DMA_OPCODE: set number of cmd words (b18 - 28); set FIFO offset for
     * first cmd word (b2 - 15); set DMA opcode = method (b29 - 31).
     * a 'NOP' is the opcode word $00000000. */
    /* note:
     * possible DMA opcodes:
     * b'000' is 'method' (execute cmd);
     * b'001' is 'jump';
     * b'010' is 'noninc method' (execute buffer wrap-around);
     * b'011' is 'call': return is executed by opcode word $00020000 (b17 = 1). */
    /* note also:
     * this system uses auto-increments for the FIFO offset addresses. Make sure
     * to set a new address if a gap exists between the previous one and the new one. */
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = ((size << 18) |
        ((si->engine.fifo.ch_ptr[cmd] + offset) & 0x0000fffc));

    /* space left after issuing the current command is the cmd AND its arguments less */
    si->engine.dma.free -= (size + 1);
}

static void nv_acc_set_ch_dma(uint16 ch, uint32 handle)
{
    /* issue FIFO channel assign cmd */
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = ((1 << 18) | ch);
    /* set new assignment */
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = (0x80000000 | handle);

    /* space left after issuing the current command is the cmd AND its arguments less */
    si->engine.dma.free -= 2;
}

/* note:
 * switching fifo channel assignments this way has no noticeable slowdown:
 * measured 0.2% with Quake2. */
void nv_acc_assert_fifo_dma(void)
{
    /* does every engine cmd this accelerant needs have a FIFO channel? */
    //fixme: can probably be optimized for both speed and channel selection...
    if (!si->engine.fifo.ch_ptr[NV_ROP5_SOLID] ||
        !si->engine.fifo.ch_ptr[NV_IMAGE_BLACK_RECTANGLE] ||
        !si->engine.fifo.ch_ptr[NV_IMAGE_PATTERN] ||
        !si->engine.fifo.ch_ptr[NV4_SURFACE] ||
        !si->engine.fifo.ch_ptr[NV_IMAGE_BLIT] ||
        !si->engine.fifo.ch_ptr[NV4_GDI_RECTANGLE_TEXT] ||
        !si->engine.fifo.ch_ptr[NV_SCALED_IMAGE_FROM_MEMORY])
    {
        uint16 cnt;

        /* free the FIFO channels we want from the currently assigned cmd's */
        si->engine.fifo.ch_ptr[si->engine.fifo.handle[0]] = 0;
        si->engine.fifo.ch_ptr[si->engine.fifo.handle[1]] = 0;
        si->engine.fifo.ch_ptr[si->engine.fifo.handle[2]] = 0;
        si->engine.fifo.ch_ptr[si->engine.fifo.handle[3]] = 0;
        si->engine.fifo.ch_ptr[si->engine.fifo.handle[4]] = 0;
        si->engine.fifo.ch_ptr[si->engine.fifo.handle[5]] = 0;
        si->engine.fifo.ch_ptr[si->engine.fifo.handle[6]] = 0;

        /* set new object handles */
        si->engine.fifo.handle[0] = NV_ROP5_SOLID;
        si->engine.fifo.handle[1] = NV_IMAGE_BLACK_RECTANGLE;
        si->engine.fifo.handle[2] = NV_IMAGE_PATTERN;
        si->engine.fifo.handle[3] = NV4_SURFACE;
        si->engine.fifo.handle[4] = NV_IMAGE_BLIT;
        si->engine.fifo.handle[5] = NV4_GDI_RECTANGLE_TEXT;
        si->engine.fifo.handle[6] = NV_SCALED_IMAGE_FROM_MEMORY;

        /* set handle's pointers to their assigned FIFO channels */
        /* note:
         * b0-1 aren't used as address bits. Using b0 to indicate a valid pointer. */
        for (cnt = 0; cnt < 0x08; cnt++)
        {
            si->engine.fifo.ch_ptr[(si->engine.fifo.handle[cnt])] =
                (0x00000001 + (cnt * 0x00002000));
        }

        /* wait for room in fifo for new FIFO assignment cmds if needed. */
        if (nv_acc_fifofree_dma(14) != B_OK) return;

        /* program new FIFO assignments */
        /* Raster OPeration: */
        nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH0, si->engine.fifo.handle[0]);
        /* Clip: */
        nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH1, si->engine.fifo.handle[1]);
        /* Pattern: */
        nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH2, si->engine.fifo.handle[2]);
        /* 2D Surface: */
        nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH3, si->engine.fifo.handle[3]);
        /* Blit: */
        nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH4, si->engine.fifo.handle[4]);
        /* Bitmap: */
        nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH5, si->engine.fifo.handle[5]);
        /* Scaled and filtered Blit: */
        nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH6, si->engine.fifo.handle[6]);

        /* tell the engine to fetch and execute all (new) commands in the DMA buffer */
        nv_start_dma();
    }
}

/* note: moved acceleration 'top-level' routines to be integrated in the engine:
 * it is costly to call the engine for every single function within a loop!
 * (measured with BeRoMeter 1.2.6: up to 15% speed increase on all CPU's.)
 * note also:
 * splitting up each command list into sublists (see routines below) prevents a lot
 * more nested calls, further increasing the speed with up to 70%.
 * finally:
 * sending the sublist to just one single engine command even further increases
 * speed with up to another 10%. This can't be done for blits though, as this
 * engine command's hardware does not support multiple objects. */
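/* typical submission flow used by all the exported routines below (a summary of
 * the helpers above, not additional driver code):
 *
 *   if (nv_acc_fifofree_dma(words_needed) != B_OK) return;   // room in the buffer?
 *   nv_acc_cmd_dma(handle, method_offset, nr_of_args);       // method header word
 *   ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = arg0;  // argument words
 *   ...
 *   nv_start_dma();                                          // hand 'put' to the engine
 */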
/* screen to screen blit - i.e. move windows around and scroll within them. */
void SCREEN_TO_SCREEN_BLIT_DMA(engine_token *et, blit_params *list, uint32 count)
{
    uint32 i = 0;
    uint16 subcnt;

    /*** init acc engine for blit function ***/
    /* ROP registers (Raster OPeration):
     * wait for room in fifo for ROP cmd if needed. */
    if (nv_acc_fifofree_dma(2) != B_OK) return;
    /* now setup ROP (writing 2 32bit words) for GXcopy */
    nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */

    /*** do each blit ***/
    /* Note:
     * blit-copy direction is determined inside nvidia hardware: no setup needed */
    while (count)
    {
        /* break up the list in sublists to minimize calls, while making sure long
         * lists still get executed without trouble */
        subcnt = 32;
        if (count < 32) subcnt = count;
        count -= subcnt;

        /* wait for room in fifo for blit cmd if needed. */
        if (nv_acc_fifofree_dma(4 * subcnt) != B_OK) return;

        while (subcnt--)
        {
            /* now setup blit (writing 4 32bit words) */
            nv_acc_cmd_dma(NV_IMAGE_BLIT, NV_IMAGE_BLIT_SOURCEORG, 3);
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((list[i].src_top) << 16) | (list[i].src_left)); /* SourceOrg */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((list[i].dest_top) << 16) | (list[i].dest_left)); /* DestOrg */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                ((((list[i].height) + 1) << 16) | ((list[i].width) + 1)); /* HeightWidth */

            i++;
        }

        /* tell the engine to fetch the commands in the DMA buffer that were not
         * executed before. */
        nv_start_dma();
    }

    /* tell 3D add-ons that they should reload their rendering states and surfaces */
    si->engine.threeD.reload = 0xffffffff;
}
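/* note on the sublist batching above (and in the fill/invert routines below):
 * each blit costs 4 buffer words (1 method header + 3 arguments), so a full
 * sublist of 32 blits reserves 4 * 32 = 128 words via nv_acc_fifofree_dma()
 * before anything is written. Reserving per sublist instead of per item keeps
 * the (relatively slow) register read in the free-space check off the hot path. */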
/* scaled and filtered screen to screen blit - i.e. video playback without overlay */
/* note: source and destination may not overlap. */
//fixme? check out NV5 and NV10 version of cmd: faster?? (or is 0x77 an 'autoselect' version?)
void SCREEN_TO_SCREEN_SCALED_FILTERED_BLIT_DMA(engine_token *et, scaled_blit_params *list, uint32 count)
{
    uint32 i = 0;
    uint16 subcnt;
    uint32 cmd_depth;
    uint8 bpp;

    /*** init acc engine for scaled filtered blit function ***/
    /* Set pixel width */
    switch (si->dm.space)
    {
    case B_RGB15_LITTLE:
        cmd_depth = 0x00000002;
        bpp = 2;
        break;
    case B_RGB16_LITTLE:
        cmd_depth = 0x00000007;
        bpp = 2;
        break;
    case B_RGB32_LITTLE:
    case B_RGBA32_LITTLE:
        cmd_depth = 0x00000004;
        bpp = 4;
        break;
    /* fixme sometime:
     * we could do the spaces below if this function would be modified to be able
     * to use a source outside of the desktop, i.e. using offscreen bitmaps... */
    case B_YCbCr422:
        cmd_depth = 0x00000005;
        bpp = 2;
        break;
    case B_YUV422:
        cmd_depth = 0x00000006;
        bpp = 2;
        break;
    default:
        /* note: this function does not support src or dest in the B_CMAP8 space! */
        //fixme: the NV10 version of this cmd supports B_CMAP8 src though... (check out)
        LOG(8,("ACC_DMA: scaled_filtered_blit, invalid bit depth\n"));
        return;
    }

    /* modify surface depth settings for 15-bit colorspace so command works as intended */
    if (si->dm.space == B_RGB15_LITTLE)
    {
        /* wait for room in fifo for surface setup cmd if needed */
        if (nv_acc_fifofree_dma(2) != B_OK) return;
        /* now setup 2D surface (writing 1 32bit word) */
        nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 1);
        ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000002; /* Format */
    }

    /* program operation mode 'SRCcopy' */
    /* wait for room in fifo for cmds if needed. */
    if (nv_acc_fifofree_dma(5) != B_OK) return;
    /* now setup source bitmap colorspace */
    nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SETCOLORFORMAT, 2);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
    /* now setup operation mode to SRCcopy */
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000003; /* SetOperation */
    /* now setup fill color (writing 2 32bit words) */
    nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* Color1A */
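    /* worked example for the inverse scale factors written below (illustrative
     * numbers only): blowing a 360-pixel-wide source up to 720 pixels gives
     * HorInvScale = (360 << 20) / 720 = 0x00080000, i.e. 0.5 in 12.20 fixed
     * point: the engine steps half a source pixel per destination pixel. Values
     * above 1.0 (0x00100000) mean the blit shrinks the source instead. */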
    /*** do each blit ***/
    while (count)
    {
        /* break up the list in sublists to minimize calls, while making sure long
         * lists still get executed without trouble */
        subcnt = 16;
        if (count < 16) subcnt = count;
        count -= subcnt;

        /* wait for room in fifo for blit cmd if needed. */
        if (nv_acc_fifofree_dma(12 * subcnt) != B_OK) return;

        while (subcnt--)
        {
            /* now setup blit (writing 12 32bit words) */
            nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCEORG, 6);
            /* setup dest clipping ref for blit (not used) (b0-15 = left, b16-31 = top) */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0; /* SourceOrg */
            /* setup dest clipping size for blit */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((list[i].dest_height + 1) << 16) | (list[i].dest_width + 1)); /* SourceHeightWidth */
            /* setup destination location and size for blit */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((list[i].dest_top) << 16) | (list[i].dest_left)); /* DestOrg */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((list[i].dest_height + 1) << 16) | (list[i].dest_width + 1)); /* DestHeightWidth */
            //fixme: find out scaling limits... (although the current cmd interface doesn't support them.)
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((list[i].src_width + 1) << 20) / (list[i].dest_width + 1)); /* HorInvScale (in 12.20 format) */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((list[i].src_height + 1) << 20) / (list[i].dest_height + 1)); /* VerInvScale (in 12.20 format) */

            nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCESIZE, 4);
            /* setup horizontal and vertical source (fetching) ends.
             * note:
             * horizontal granularity is 2 pixels, vertical granularity is 1 pixel.
             * look at Matrox or Neomagic bes engines code for usage example. */
            //fixme: tested 15, 16 and 32-bit RGB depth, verify other depths...
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((list[i].src_height + 1) << 16) |
                 (((list[i].src_width + 1) + 0x0001) & ~0x0001)); /* SourceHeightWidth */
            /* setup source pitch (b0-15). Set 'format origin center' (b16-17) and
             * select 'format interpolator foh (bilinear filtering)' (b24). */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (si->fbc.bytes_per_row | (1 << 16) | (1 << 24)); /* SourcePitch */
            /* setup source surface location */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                ((uint32)((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer)) +
                (list[i].src_top * si->fbc.bytes_per_row) + (list[i].src_left * bpp); /* Offset */
            /* setup source start: first (sub)pixel contributing to output picture */
            /* note:
             * clipping is not asked for.
             * look at nVidia NV10+ bes engine code for usage example. */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                0; /* SourceRef (b0-15 = hor, b16-31 = ver: both in 12.4 format) */

            i++;
        }

        /* tell the engine to fetch the commands in the DMA buffer that were not
         * executed before. */
        nv_start_dma();
    }

    /* reset surface depth settings so the other engine commands work as intended */
    if (si->dm.space == B_RGB15_LITTLE)
    {
        /* wait for room in fifo for surface setup cmd if needed */
        if (nv_acc_fifofree_dma(2) != B_OK) return;
        /* now setup 2D surface (writing 1 32bit word) */
        nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 1);
        ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000004; /* Format */

        /* tell the engine to fetch the commands in the DMA buffer that were not
         * executed before. */
        nv_start_dma();
    }

    /* tell 3D add-ons that they should reload their rendering states and surfaces */
    si->engine.threeD.reload = 0xffffffff;
}

/* rectangle fill - i.e. workspace and window background color */
void FILL_RECTANGLE_DMA(engine_token *et, uint32 colorIndex, fill_rect_params *list, uint32 count)
{
    uint32 i = 0;
    uint16 subcnt;

    /*** init acc engine for fill function ***/
    /* ROP registers (Raster OPeration):
     * wait for room in fifo for ROP and bitmap cmd if needed. */
    if (nv_acc_fifofree_dma(4) != B_OK) return;
    /* now setup ROP (writing 2 32bit words) for GXcopy */
    nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
    /* now setup fill color (writing 2 32bit words) */
    nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = colorIndex; /* Color1A */

    /*** draw each rectangle ***/
    while (count)
    {
        /* break up the list in sublists to minimize calls, while making sure long
         * lists still get executed without trouble */
        subcnt = 32;
        if (count < 32) subcnt = count;
        count -= subcnt;

        /* wait for room in fifo for bitmap cmd if needed. */
        if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;

        /* issue fill command once... */
        nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
        /* ... and send multiple rects (engine cmd supports 32 max) */
        while (subcnt--)
        {
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((((list[i].right)+1) - (list[i].left)) << 16) |
                 (((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */

            i++;
        }

        /* tell the engine to fetch the commands in the DMA buffer that were not
         * executed before. */
        nv_start_dma();
    }

    /* tell 3D add-ons that they should reload their rendering states and surfaces */
    si->engine.threeD.reload = 0xffffffff;
}
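/* note on the list format used by FILL_SPAN_DMA below: each span occupies three
 * consecutive uint16 entries which (judging from how the indices are used in the
 * loop) are laid out as { y, x_start, x_end }. The spans are drawn as 1-pixel-high
 * unclipped rects, so a span from x 10 to x 40 on line 100 becomes
 * LeftTop = (10 << 16) | 100 and WidthHeight = (31 << 16) | 1. */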
/* span fill - i.e. (selected) menuitem background color (Dano) */
void FILL_SPAN_DMA(engine_token *et, uint32 colorIndex, uint16 *list, uint32 count)
{
    uint32 i = 0;
    uint16 subcnt;

    /*** init acc engine for fill function ***/
    /* ROP registers (Raster OPeration):
     * wait for room in fifo for ROP and bitmap cmd if needed. */
    if (nv_acc_fifofree_dma(4) != B_OK) return;
    /* now setup ROP (writing 2 32bit words) for GXcopy */
    nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
    /* now setup fill color (writing 2 32bit words) */
    nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = colorIndex; /* Color1A */

    /*** draw each span ***/
    while (count)
    {
        /* break up the list in sublists to minimize calls, while making sure long
         * lists still get executed without trouble */
        subcnt = 32;
        if (count < 32) subcnt = count;
        count -= subcnt;

        /* wait for room in fifo for bitmap cmd if needed. */
        if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;

        /* issue fill command once... */
        nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
        /* ... and send multiple rects (spans) (engine cmd supports 32 max) */
        while (subcnt--)
        {
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((list[i+1]) << 16) | ((list[i]) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                ((((list[i+2]+1) - (list[i+1])) << 16) | 0x00000001); /* Unclipped Rect 0 WidthHeight */

            i+=3;
        }

        /* tell the engine to fetch the commands in the DMA buffer that were not
         * executed before. */
        nv_start_dma();
    }

    /* tell 3D add-ons that they should reload their rendering states and surfaces */
    si->engine.threeD.reload = 0xffffffff;
}

/* rectangle invert - i.e. text cursor and text selection */
void INVERT_RECTANGLE_DMA(engine_token *et, fill_rect_params *list, uint32 count)
{
    uint32 i = 0;
    uint16 subcnt;

    /*** init acc engine for invert function ***/
    /* ROP registers (Raster OPeration):
     * wait for room in fifo for ROP and bitmap cmd if needed. */
    if (nv_acc_fifofree_dma(4) != B_OK) return;
    /* now setup ROP (writing 2 32bit words) for GXinvert */
    nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x55; /* SetRop5 */
    /* now reset fill color (writing 2 32bit words) */
    nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
    ((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* Color1A */

    /*** invert each rectangle ***/
    while (count)
    {
        /* break up the list in sublists to minimize calls, while making sure long
         * lists still get executed without trouble */
        subcnt = 32;
        if (count < 32) subcnt = count;
        count -= subcnt;

        /* wait for room in fifo for bitmap cmd if needed. */
        if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;

        /* issue fill command once... */
        nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
        /* ... and send multiple rects (engine cmd supports 32 max) */
        while (subcnt--)
        {
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
            ((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
                (((((list[i].right)+1) - (list[i].left)) << 16) |
                 (((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */

            i++;
        }

        /* tell the engine to fetch the commands in the DMA buffer that were not
         * executed before. */
        nv_start_dma();
    }

    /* tell 3D add-ons that they should reload their rendering states and surfaces */
    si->engine.threeD.reload = 0xffffffff;
}
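/* illustrative call pattern (hypothetical values; in practice the accelerant's
 * hook glue supplies the engine_token and the parameter lists coming from the
 * app_server):
 *
 *   fill_rect_params rect = { 10, 10, 109, 59 };   // left, top, right, bottom
 *   FILL_RECTANGLE_DMA(et, 0x00ff0000, &rect, 1);  // one 100x50 red fill (32-bit mode)
 *   nv_acc_wait_idle_dma();                        // only needed before the CPU touches the bits
 */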