/*
	Copyright (c) 2002, Thomas Kurschel


	Part of Radeon accelerant

	Hardware accelerator management

	All accelerator commands go through the following steps:
	- the accelerant adds the command to the CP buffer and updates the CP
	  write pointer
	- the CP fetches the command and sends it to the MicroController
	- the MicroController instructs the 2D unit to execute the command
	- the 2D unit draws into the 2D Destination Cache (DC)
	- the 2D Destination Cache is drained to the frame buffer

	Whenever BeOS requires a token, a command is queued that writes the
	timestamp into Scratch Register 0. I haven't fully understood when and
	how coherency is assured by the Radeon, so I assume the following:
	- when the timestamp is written, all previous commands have been issued,
	  i.e. they have been read and executed by the MicroController
	- to make sure previously issued 2D commands have finished,
	  a WAIT_2D_IDLECLEAN command is inserted before the scratch register
	  write
	- to flush the destination cache, an RB2D_DC_FLUSH_ALL command is
	  issued before the wait; I hope that the wait command also waits for
	  the flush to complete, but I'm not sure about that

	That leaves the cache coherency problem. You can set various bits in the
	DSTCACHE_MODE register to take care of it, but first, I don't really
	understand them, and second, I'm not sure which other caches/FIFOs may
	cause trouble. In particular, BeOS wants to use CPU and CP accesses in
	parallel; hopefully they don't interfere.

	I know that the PAINT_MULTI command causes trouble if you change the
	ROP to something other than a plain copy: CPU writes produce garbage in
	the frame buffer for the next couple of accesses. Resetting the ROP to a
	simple copy helps, but I'm not sure what happens with concurrent CPU
	accesses to other areas of the frame buffer.
*/
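
/*
	For reference, this is roughly how a client of these hooks is expected to
	drive the token mechanism described above. The exact call pattern of the
	app_server is an assumption here, not something this file enforces:

		engine_token *et;
		sync_token st;

		ACQUIRE_ENGINE( B_2D_ACCELERATION, 0, NULL, &et );
		// ... queue blits/fills via the other accelerant hooks ...
		RELEASE_ENGINE( et, &st );	// current engine count becomes the token

		// later, before touching the frame buffer with the CPU:
		SYNC_TO_TOKEN( &st );		// blocks until the token has passed
*/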


#include "radeon_accelerant.h"
#include "generic.h"
#include "rbbm_regs.h"
#include "GlobalData.h"
#include "mmio.h"
#include "CP.h"

static engine_token radeon_engine_token = { 1, B_2D_ACCELERATION, NULL };

// public function: return the number of hardware engines
uint32 ACCELERANT_ENGINE_COUNT(void)
{
	// hm, is there *any* card sporting more than
	// one hardware accelerator???
	return 1;
}

// write the current sync token into the CP stream;
// we instruct the CP to flush all kinds of caches first so we don't
// interfere with subsequent host writes
static void writeSyncToken( accelerator_info *ai )
{
	// don't write token if it hasn't changed since last write
	if( ai->si->engine.count == ai->si->engine.written )
		return;

	if( ai->si->acc_dma ) {
		START_IB();

		// flush pending data
		WRITE_IB_REG( RADEON_RB2D_DSTCACHE_CTLSTAT, RADEON_RB2D_DC_FLUSH_ALL );

		// make sure commands are finished
		WRITE_IB_REG( RADEON_WAIT_UNTIL, RADEON_WAIT_2D_IDLECLEAN |
			RADEON_WAIT_3D_IDLECLEAN | RADEON_WAIT_HOST_IDLECLEAN );

		// write scratch register
		WRITE_IB_REG( RADEON_SCRATCH_REG0, ai->si->engine.count );

		ai->si->engine.written = ai->si->engine.count;

		SUBMIT_IB();
	} else {
		Radeon_WaitForFifo( ai, 2 );
		OUTREG( ai->regs, RADEON_RB2D_DSTCACHE_CTLSTAT, RADEON_RB2D_DC_FLUSH_ALL );
		OUTREG( ai->regs, RADEON_WAIT_UNTIL, RADEON_WAIT_2D_IDLECLEAN |
		   RADEON_WAIT_3D_IDLECLEAN |
		   RADEON_WAIT_HOST_IDLECLEAN );
		ai->si->engine.written = ai->si->engine.count;
	}
}

// public function: acquire the engine for future use
//	capabilities - required 2D/3D capabilities of the engine, ignored
//	max_wait - maximum time we want to wait (in ms?), ignored
//	st - when the engine has been acquired, wait for this sync token
//	et - (out) specifier of the acquired engine
status_t ACQUIRE_ENGINE( uint32 capabilities, uint32 max_wait,
	sync_token *st, engine_token **et )
{
	shared_info *si = ai->si;

	SHOW_FLOW0( 4, "" );

	(void)capabilities;
	(void)max_wait;

	ACQUIRE_BEN( si->engine.lock )

	// wait for sync
	if (st)
		SYNC_TO_TOKEN( st );

	*et = &radeon_engine_token;
	return B_OK;
}

// public function: release the engine
//	et - engine to release
//	st - (out) sync token to be filled out
status_t RELEASE_ENGINE( engine_token *et, sync_token *st )
{
	shared_info *si = ai->si;

	SHOW_FLOW0( 4, "" );

	// fill out sync token
	if (st) {
		writeSyncToken( ai );

		st->engine_id = et->engine_id;
		st->counter = si->engine.count;
	}

	RELEASE_BEN( ai->si->engine.lock )

	return B_OK;
}

// public function: wait until the engine is idle
// ??? which engine to wait for? Does anyone use this function?
//     Is the lock held?
void WAIT_ENGINE_IDLE(void)
{
	SHOW_FLOW0( 4, "" );

	Radeon_WaitForIdle( ai, false );
}

// public function: get sync token
//	et - engine to wait for
//	st - (out) sync token to be filled out
status_t GET_SYNC_TOKEN( engine_token *et, sync_token *st )
{
	shared_info *si = ai->si;

	SHOW_FLOW0( 4, "" );

	writeSyncToken( ai );

	st->engine_id = et->engine_id;
	st->counter = si->engine.count;

	SHOW_FLOW( 4, "got counter=%d", si->engine.count );

	return B_OK;
}

// busy-wait for (at least) <delay> microseconds;
// this is the same as the corresponding kernel function
void Radeon_Spin( uint32 delay )
{
	bigtime_t start_time;

	start_time = system_time();

	while( system_time() - start_time < delay )
		;
}

// public: sync to token
//	st - token to wait for
status_t SYNC_TO_TOKEN( sync_token *st )
{
	shared_info *si = ai->si;
	bigtime_t start_time, sample_time;

	SHOW_FLOW0( 4, "" );

	if ( !ai->si->acc_dma )
	{
		Radeon_WaitForFifo( ai, 64 );
		Radeon_WaitForIdle( ai, false );
		return B_OK;
	}

	start_time = system_time();

	while( 1 ) {
		SHOW_FLOW( 4, "passed counter=%d",
			((uint32 *)(ai->mapped_memory[si->cp.feedback.mem_type].data + si->cp.feedback.scratch_mem_offset))[0] );
			//si->cp.scratch.ptr[0] );

		// a bit nasty: the counter is 64 bit, but we only have its lower
		// 32 bits here; computing the difference in unsigned arithmetic and
		// testing its sign handles wrap-arounds correctly, as long as we
		// never lag more than 2^31 tokens behind
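		// (worked example with hypothetical values: scratch = 0x00000005
		// after a wrap, token = 0xfffffff0; 0x00000005 - 0xfffffff0 =
		// 0x00000015, which is >= 0 as int32, so the token correctly counts
		// as passed)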
		if( (int32)(
			((uint32 *)(ai->mapped_memory[si->cp.feedback.mem_type].data + si->cp.feedback.scratch_mem_offset))[0]
			//si->cp.scratch.ptr[0]
			- st->counter) >= 0 )
			return B_OK;
		/*if( (int32)(INREG( ai->regs, RADEON_SCRATCH_REG0 ) - st->counter) >= 0 )
			return B_OK;*/

		// commands have not finished yet;
		// this is a good time to free completed buffers as we have to
		// busy-wait anyway
		ACQUIRE_BEN( si->cp.lock );
		Radeon_FreeIndirectBuffers( ai );
		RELEASE_BEN( si->cp.lock );

		sample_time = system_time();

		// give up after 100 ms
		if( sample_time - start_time > 100000 )
			break;

		// use exponential back-off:
		// in the beginning we busy-wait, later on we let the thread sleep;
		// the micro-spin is used to reduce PCI load
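		// (illustration with made-up numbers: after 6 ms of waiting we snooze
		// about 0.6 ms, after 50 ms about 5 ms, so the elapsed time grows by
		// roughly 10% per iteration and the loop count stays small)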
		if( sample_time - start_time > 5000 )
			snooze( (sample_time - start_time) / 10 );
		else
			Radeon_Spin( 1 );
	}

	// we could reset the engine now, but the caller doesn't need to acquire
	// the engine before calling this function, so we either reset it
	// without sync (ouch!) or acquire the engine first and risk deadlocking
	SHOW_ERROR( 0, "Failed waiting for token %d (active token: %d)",
		st->counter, /*INREG( ai->regs, RADEON_SCRATCH_REG0 )*/
		((uint32 *)(ai->mapped_memory[si->cp.feedback.mem_type].data + si->cp.feedback.scratch_mem_offset))[0] );
		//si->cp.scratch.ptr[0] );

	Radeon_ResetEngine( ai );

	return B_ERROR;
}