/*
	Copyright (c) 2002, Thomas Kurschel


	Part of Radeon accelerant

	Command Processor handling


	Something about synchronization in general:

	The DDK says that only some register accesses are stored in the
	Command FIFO, i.e. in almost all cases you don't have to wait until
	there is enough space in this FIFO. Unfortunately, ATI doesn't speak
	clearly here and doesn't tell you which registers are buffered and
	which are not (the r300 DDK provides some examples only, other DDKs
	refer to some include file where no such info could be found).

	Looking at pre-Radeon specs, we have the following register ranges:
		0		configuration/display/multi-media registers
		0xf00	read-only PCI configuration space
		0x1000	CCE registers
		0x1400	FIFOed GUI-registers

	So, if the list is still correct, the affected registers are only
	those used for 2D/3D drawing.

	This is very important: if the register you want to write is
	buffered, you have to busy-wait until there is enough FIFO
	space. As concurrent threads may do the same, register access should
	only be done with a lock held. We never write GUI-registers directly,
	so we never have to wait for the FIFO and thus don't need this lock.

*/
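
/*
	For illustration only - a direct GUI register write would have to be
	preceded by a FIFO wait roughly like the sketch below (a minimal sketch:
	RADEON_RBBM_STATUS and RADEON_RBBM_FIFOCNT_MASK are assumed names for the
	status register and its free-entry-count mask, and "num_entries" is the
	number of slots needed; the whole sequence would have to run under the
	lock mentioned above):

		// spin until the GUI FIFO reports enough free entries
		while( (INREG( ai->regs, RADEON_RBBM_STATUS )
				& RADEON_RBBM_FIFOCNT_MASK) < num_entries )
			Radeon_Spin( 1 );

	The CP/ring buffer scheme implemented in this file avoids these busy
	waits entirely.
*/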

#include "radeon_accelerant.h"
#include "mmio.h"
#include "buscntrl_regs.h"
#include "utils.h"
#include <sys/ioctl.h>
#include "CP.h"

#include "log_coll.h"
#include "log_enum.h"

#include <string.h>


// get number of free entries in CP's ring buffer
static uint getAvailRingBuffer( accelerator_info *ai )
{
	CP_info *cp = &ai->si->cp;
	int space;

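	// the CP writes its current read position ("head") back into the
	// feedback area in memory, so it is read from there instead of
	// doing a (slow) register read of RADEON_CP_RB_RPTR
	// (cf. the commented-out alternative below)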
	space =
		*(uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data + cp->feedback.head_mem_offset)
		//*cp->ring.head
		- cp->ring.tail;
	//space = INREG( ai->regs, RADEON_CP_RB_RPTR ) - cp->ring.tail;

	if( space <= 0 )
		space += cp->ring.size;

	// don't fill up the entire buffer as we cannot
	// distinguish between a full and an empty ring
	--space;

	SHOW_FLOW( 3, "head=%ld, tail=%ld, space=%ld",
		*(uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data + cp->feedback.head_mem_offset),
		//*cp->ring.head,
		cp->ring.tail, space );

	LOG1( ai->si->log, _GetAvailRingBufferQueue, space );

	cp->ring.space = space;

	return space;
}


// mark all indirect buffers that have been processed as being free;
// lock must be held
void Radeon_FreeIndirectBuffers( accelerator_info *ai )
{
	CP_info *cp = &ai->si->cp;
	int32 cur_processed_tag =
		((uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data + cp->feedback.scratch_mem_offset))[1];
		//ai->si->cp.scratch.ptr[1];
	//INREG( ai->regs, RADEON_SCRATCH_REG1 );

	SHOW_FLOW( 3, "processed_tag=%d", cur_processed_tag );

	// mark all sent indirect buffers as free
	while( cp->buffers.oldest != -1 ) {
		indirect_buffer *oldest_buffer =
			&cp->buffers.buffers[cp->buffers.oldest];
		int tmp_oldest_buffer;

		SHOW_FLOW( 3, "oldest buffer's tag: %d", oldest_buffer->send_tag );

		// this is a tricky calculation to handle wrap-arounds correctly,
		// so don't change it unless you really understand the signedness problem
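		// (e.g. with send_tag = 0xfffffffe and cur_processed_tag = 2, the
		// difference is 4, which is positive as int32, so a buffer sent just
		// before the tag counter wrapped is still detected as processed)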
		if( (int32)(cur_processed_tag - oldest_buffer->send_tag) < 0 )
			break;

		SHOW_FLOW( 3, "mark %d as being free", oldest_buffer->send_tag );

		// remove buffer from "used" list
		tmp_oldest_buffer = oldest_buffer->next;

		if( tmp_oldest_buffer == -1 )
			cp->buffers.newest = -1;

		// put it on free list
		oldest_buffer->next = cp->buffers.free_list;
		cp->buffers.free_list = cp->buffers.oldest;

		cp->buffers.oldest = tmp_oldest_buffer;
	}
}


// wait until an indirect buffer becomes available;
// lock must be held
static void Radeon_WaitForFreeIndirectBuffers( accelerator_info *ai )
{
	bigtime_t start_time;
	CP_info *cp = &ai->si->cp;

	SHOW_FLOW0( 3, "" );

	start_time = system_time();

	while( 1 ) {
		bigtime_t sample_time;

		Radeon_FreeIndirectBuffers( ai );

		if( cp->buffers.free_list >= 0 )
			return;

		sample_time = system_time();

		if( sample_time - start_time > 100000 )
			break;

		RELEASE_BEN( cp->lock );

		// use exponential back-off: busy-wait in the beginning,
		// let the thread sleep later on;
		// the micro-spin is used to reduce PCI load
		if( sample_time - start_time > 5000 )
			snooze( (sample_time - start_time) / 10 );
		else
			Radeon_Spin( 1 );

		ACQUIRE_BEN( cp->lock );
	}

	SHOW_ERROR0( 0, "All buffers are in use and the engine doesn't finish any of them" );

	// lock must be released during reset (reset acquires it automatically)
	RELEASE_BEN( cp->lock );
	Radeon_ResetEngine( ai );
	ACQUIRE_BEN( cp->lock );
}

// allocate an indirect buffer
int Radeon_AllocIndirectBuffer( accelerator_info *ai, bool keep_lock )
{
	CP_info *cp = &ai->si->cp;
	int buffer_idx;

	SHOW_FLOW0( 3, "" );

	ACQUIRE_BEN( cp->lock );

	if( cp->buffers.free_list == -1 )
		Radeon_WaitForFreeIndirectBuffers( ai );

	buffer_idx = cp->buffers.free_list;
	cp->buffers.free_list = cp->buffers.buffers[buffer_idx].next;

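	// note: the keep_lock feature is currently disabled - the lock is always
	// released here, and the parameter is only referenced to silence the
	// "unused parameter" warning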
	//if( !keep_lock )
		RELEASE_BEN( cp->lock );
	(void)keep_lock;

	SHOW_FLOW( 3, "got %d", buffer_idx );

	return buffer_idx;
}


// explicitly free an indirect buffer;
// this is not needed if the buffer was sent via SendIndirectBuffer()
// never_used	- 	set to true if the buffer was never used, not even
//					indirectly as a state buffer
// !Warning!
// if never_used is false, execution may take very long as all buffers
// must be flushed!
void Radeon_FreeIndirectBuffer( accelerator_info *ai, int buffer_idx, bool never_used )
{
	CP_info *cp = &ai->si->cp;

	SHOW_FLOW( 3, "buffer_idx=%d, never_used=%d", buffer_idx, never_used );

	// if the buffer was used as a state buffer, we don't record its usage,
	// so we don't know if the buffer was/is/will be used;
	// the only way to be sure is to let the CP run dry
	if( !never_used )
		Radeon_WaitForIdle( ai, false );

	ACQUIRE_BEN( cp->lock );

	cp->buffers.buffers[buffer_idx].next = cp->buffers.free_list;
	cp->buffers.free_list = buffer_idx;

	RELEASE_BEN( cp->lock );

	SHOW_FLOW0( 3, "done" );
}

// forward declaration; the function is defined at the end of this file
// to avoid inlining
void Radeon_WaitForRingBufferSpace( accelerator_info *ai, uint num_dwords );


// start writing to ring buffer
// num_dwords - number of dwords to write (must be precise!)
// !Warning!
// during wait, CP's benaphore is released
#define WRITE_RB_START( num_dwords ) \
	{ \
		uint32 *ring_start; \
		uint32 ring_tail, ring_tail_mask; \
		uint32 ring_tail_increment = (num_dwords); \
		if( cp->ring.space < ring_tail_increment ) \
			Radeon_WaitForRingBufferSpace( ai, ring_tail_increment ); \
		ring_start = \
		(uint32 *)(ai->mapped_memory[cp->ring.mem_type].data + cp->ring.mem_offset); \
			/*cp->ring.start;*/ \
		ring_tail = cp->ring.tail; \
		ring_tail_mask = cp->ring.tail_mask;

// write single dword to ring buffer
#define WRITE_RB( value ) \
	{ \
		uint32 val = (value); \
		SHOW_FLOW( 3, "@%d: %x", ring_tail, val ); \
		ring_start[ring_tail++] = val; \
		ring_tail &= ring_tail_mask; \
	}

// finish writing to ring buffer
#define WRITE_RB_FINISH \
		cp->ring.tail = ring_tail; \
		cp->ring.space -= ring_tail_increment; \
	}
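
// typical usage of these macros (a sketch only; "packet_header" and
// "packet_data" stand for whatever dwords are to be queued):
//
//		WRITE_RB_START( 2 );
//		WRITE_RB( packet_header );
//		WRITE_RB( packet_data );
//		WRITE_RB_FINISH;
//
// the count passed to WRITE_RB_START must match the number of WRITE_RB
// calls exactly, as it is subtracted from the cached free space of the ring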

// submit indirect buffer for execution.
// the indirect buffer must not be used afterwards!
// buffer_idx			- index of indirect buffer to submit
// buffer_size  		- size of indirect buffer in dwords (32-bit words)
// state_buffer_idx		- index of indirect buffer to restore required state
// state_buffer_size	- size of indirect buffer to restore required state
// the buffer is tagged internally so it can be freed once the CP has executed it
// if no special state is required, set state_buffer_size to zero
void Radeon_SendIndirectBuffer( accelerator_info *ai,
	int buffer_idx, int buffer_size,
	int state_buffer_idx, int state_buffer_size, bool has_lock )
{
	CP_info *cp = &ai->si->cp;
	bool need_stateupdate;

	SHOW_FLOW( 3, "buffer_idx=%d, buffer_size=%d, state_buffer_idx=%d, state_buffer_size=%d",
		buffer_idx, buffer_size, state_buffer_idx, state_buffer_size );

	if( (buffer_size & 1) != 0 ) {
		SHOW_FLOW( 3, "buffer has odd size (%d)", buffer_size );
		// the size of indirect buffers _must_ be a multiple of 64 bits, so
		// append a nop to satisfy the alignment
		Radeon_GetIndirectBufferPtr( ai, buffer_idx )[buffer_size] = RADEON_CP_PACKET2;
		buffer_size += 1;
	}

	//if( !has_lock )
		ACQUIRE_BEN( cp->lock );
	(void)has_lock;

	need_stateupdate =
		state_buffer_size > 0 && state_buffer_idx != cp->buffers.active_state;

	WRITE_RB_START( 5 + (need_stateupdate ? 3 : 0) );

	// if the indirect buffer to submit requires a special state and the
	// hardware is in the wrong state, execute the state buffer first
	if( need_stateupdate ) {
		SHOW_FLOW0( 3, "update state" );

		WRITE_RB( CP_PACKET0( RADEON_CP_IB_BASE, 2 ));
		WRITE_RB( cp->buffers.vm_start +
			state_buffer_idx * INDIRECT_BUFFER_SIZE * sizeof( uint32 ));
		WRITE_RB( state_buffer_size );

		cp->buffers.active_state = state_buffer_idx;
	}

	// execute indirect buffer
	WRITE_RB( CP_PACKET0( RADEON_CP_IB_BASE, 2 ));
	WRITE_RB( cp->buffers.vm_start + buffer_idx * INDIRECT_BUFFER_SIZE * sizeof( uint32 ));
	WRITE_RB( buffer_size );

	// give buffer a tag so it can be freed after execution
	WRITE_RB( CP_PACKET0( RADEON_SCRATCH_REG1, 1 ));
	WRITE_RB( cp->buffers.buffers[buffer_idx].send_tag = (int32)++cp->buffers.cur_tag );

	SHOW_FLOW( 3, "Assigned tag %d", cp->buffers.buffers[buffer_idx].send_tag );

	WRITE_RB_FINISH;

	// append buffer to list of submitted buffers
	if( cp->buffers.newest >= 0 )
		cp->buffers.buffers[cp->buffers.newest].next = buffer_idx;
	else
		cp->buffers.oldest = buffer_idx;

	cp->buffers.newest = buffer_idx;
	cp->buffers.buffers[buffer_idx].next = -1;

	// flush writes to CP buffers
	// (this is a bit of overkill - currently, only some WinChip/Cyrix
	//  CPUs support out-of-order writes, but we are prepared)
	// TODO : Other Architectures? PowerPC?
	#ifdef __i386__
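	// a locked read-modify-write on the stack acts as a full memory barrier
	// on IA-32, even on CPUs that don't support the mfence instruction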
	__asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
	#endif
	// make sure the motherboard chipset has flushed its write buffer by
	// reading some uncached memory
	//(void)*(volatile int *)si->framebuffer;
	INREG( ai->regs, RADEON_CP_RB_RPTR );

	//SHOW_FLOW( 3, "new tail: %d", cp->ring.tail );

	//snooze( 100 );

	// now, the command list should really be written to memory,
	// so it's safe to instruct the graphics card to read it
	OUTREG( ai->regs, RADEON_CP_RB_WPTR, cp->ring.tail );

	// read from PCI bus to ensure correct posting
	//INREG( ai->regs, RADEON_CP_RB_RPTR );

	RELEASE_BEN( cp->lock );

	SHOW_FLOW0( 3, "done" );
}
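
/*
	Typical usage of the indirect buffer API (a sketch only - "num_dwords"
	and the packet contents are placeholders):

		int idx = Radeon_AllocIndirectBuffer( ai, false );
		uint32 *buffer = Radeon_GetIndirectBufferPtr( ai, idx );

		// ... fill buffer[0 .. num_dwords-1] with CP packets ...

		// no special state required, so state_buffer_size is zero;
		// the buffer is freed automatically once the CP has executed it
		Radeon_SendIndirectBuffer( ai, idx, num_dwords, 0, 0, false );
*/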


// mark state buffer as being invalid;
// this must be done _before_ modifying the state buffer as the
// state buffer may be in use
void Radeon_InvalidateStateBuffer( accelerator_info *ai, int state_buffer_idx )
{
	CP_info *cp = &ai->si->cp;

	// make sure state buffer is not used anymore
	Radeon_WaitForIdle( ai, false );

	ACQUIRE_BEN( cp->lock );

	// mark state as being invalid
	if( cp->buffers.active_state == state_buffer_idx )
		cp->buffers.active_state = -1;

	RELEASE_BEN( cp->lock );
}


// wait until there is enough space in the ring buffer
// num_dwords - number of dwords needed in the ring buffer
// must be called with the benaphore held
void Radeon_WaitForRingBufferSpace( accelerator_info *ai, uint num_dwords )
{
	bigtime_t start_time;
	CP_info *cp = &ai->si->cp;

	start_time = system_time();

	while( getAvailRingBuffer( ai ) < num_dwords ) {
		bigtime_t sample_time;

		sample_time = system_time();

		if( sample_time - start_time > 100000 )
			break;

		RELEASE_BEN( cp->lock );

		// use exponential back-off: busy-wait in the beginning,
		// let the thread sleep later on;
		// the micro-spin is used to reduce PCI load
		if( sample_time - start_time > 5000 )
			snooze( (sample_time - start_time) / 10 );
		else
			Radeon_Spin( 1 );

		ACQUIRE_BEN( cp->lock );
	}
}