/*
	Copyright (c) 2002, Thomas Kurschel

	Part of Radeon accelerant

	Command Processor handling


	Something about synchronization in general:

	The DDK says that only some register accesses are stored in the
	Command FIFO, i.e. in almost all cases you don't have to wait until
	there is enough space in this FIFO. Unfortunately, ATI doesn't speak
	clearly here and doesn't tell you which registers are buffered and
	which are not (the r300 DDK provides some examples only; other DDKs
	refer to some include file where no such info could be found).

	Looking at pre-Radeon specs, we have the following register ranges:

		0x0000	configuration/display/multi-media registers
		0x0f00	read-only PCI configuration space
		0x1000	CCE registers
		0x1400	FIFOed GUI-registers

	So, if this list is still correct, the affected registers are only
	those used for 2D/3D drawing.

	This is important because, if the register you want to write is
	buffered, you have to busy-wait until there is enough FIFO space,
	and as concurrent threads may do the same, such register accesses
	must only be done with a lock held. We never write GUI-registers
	directly, so we never have to wait for the FIFO and thus don't need
	this lock.
*/

#include "radeon_accelerant.h"
#include "mmio.h"
#include "buscntrl_regs.h"
#include "utils.h"
#include <string.h>

#include "CP.h"

#include "log_coll.h"
#include "log_enum.h"

#include <string.h>


// get number of free entries in CP's ring buffer
static uint getAvailRingBuffer( accelerator_info *ai )
{
	CP_info *cp = &ai->si->cp;
	int space;

	space = *(uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data
			+ cp->feedback.head_mem_offset)
		//*cp->ring.head
		- cp->ring.tail;
	//space = INREG( ai->regs, RADEON_CP_RB_RPTR ) - cp->ring.tail;

	if( space <= 0 )
		space += cp->ring.size;

	// don't fill up the entire buffer as we cannot
	// distinguish between a full and an empty ring
	--space;

	SHOW_FLOW( 3, "head=%ld, tail=%ld, space=%ld",
		*(uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data
			+ cp->feedback.head_mem_offset),
		//*cp->ring.head,
		cp->ring.tail, space );

	LOG1( ai->si->log, _GetAvailRingBufferQueue, space );

	cp->ring.space = space;

	return space;
}


// mark all indirect buffers that have been processed as being free;
// lock must be held
void Radeon_FreeIndirectBuffers( accelerator_info *ai )
{
	CP_info *cp = &ai->si->cp;
	int32 cur_processed_tag =
		((uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data
			+ cp->feedback.scratch_mem_offset))[1];
		//ai->si->cp.scratch.ptr[1];
		//INREG( ai->regs, RADEON_SCRATCH_REG1 );

	SHOW_FLOW( 3, "processed_tag=%d", cur_processed_tag );

	// mark all sent indirect buffers as free
	while( cp->buffers.oldest != -1 ) {
		indirect_buffer *oldest_buffer =
			&cp->buffers.buffers[cp->buffers.oldest];
		int tmp_oldest_buffer;

		SHOW_FLOW( 3, "oldest buffer's tag: %d", oldest_buffer->send_tag );

		// this is a tricky calculation to handle wrap-arounds correctly,
		// so don't change it unless you really understand the signedness problem
		if( (int32)(cur_processed_tag - oldest_buffer->send_tag) < 0 )
			break;

		SHOW_FLOW( 3, "mark %d as being free", oldest_buffer->send_tag );

		// remove buffer from "used" list
		tmp_oldest_buffer = oldest_buffer->next;

		if( tmp_oldest_buffer == -1 )
			cp->buffers.newest = -1;

		// put it on free list
		oldest_buffer->next = cp->buffers.free_list;
		cp->buffers.free_list = cp->buffers.oldest;

		cp->buffers.oldest = tmp_oldest_buffer;
	}
}
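// Worked example for the wrap-around test in Radeon_FreeIndirectBuffers()
// above (numbers chosen for illustration only): with 32 bit tags,
// cur_processed_tag = 0x00000001 and send_tag = 0xfffffffe yield
// (int32)(0x00000001 - 0xfffffffe) = 3 >= 0, so a buffer that was tagged
// just before the counter wrapped is still detected as processed,
// whereas a plain unsigned compare of the two tags would never free it.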
// wait until an indirect buffer becomes available;
// lock must be held
static void Radeon_WaitForFreeIndirectBuffers( accelerator_info *ai )
{
	bigtime_t start_time;
	CP_info *cp = &ai->si->cp;

	SHOW_FLOW0( 3, "" );

	start_time = system_time();

	while( 1 ) {
		bigtime_t sample_time;

		Radeon_FreeIndirectBuffers( ai );

		if( cp->buffers.free_list >= 0 )
			return;

		sample_time = system_time();

		if( sample_time - start_time > 100000 )
			break;

		RELEASE_BEN( cp->lock );

		// use exponential fall-off:
		// in the beginning do busy-waiting, later on we let the thread sleep;
		// the micro-spin is used to reduce PCI load
		if( sample_time - start_time > 5000 )
			snooze( (sample_time - start_time) / 10 );
		else
			Radeon_Spin( 1 );

		ACQUIRE_BEN( cp->lock );
	}

	SHOW_ERROR0( 0, "All buffers are in use and engine doesn't finish any of them" );

	// lock must be released during reset (reset acquires it automatically)
	RELEASE_BEN( cp->lock );
	Radeon_ResetEngine( ai );
	ACQUIRE_BEN( cp->lock );
}


// allocate an indirect buffer
int Radeon_AllocIndirectBuffer( accelerator_info *ai, bool keep_lock )
{
	CP_info *cp = &ai->si->cp;
	int buffer_idx;

	SHOW_FLOW0( 3, "" );

	ACQUIRE_BEN( cp->lock );

	if( cp->buffers.free_list == -1 )
		Radeon_WaitForFreeIndirectBuffers( ai );

	buffer_idx = cp->buffers.free_list;
	cp->buffers.free_list = cp->buffers.buffers[buffer_idx].next;

	//if( !keep_lock )
		RELEASE_BEN( cp->lock );

	(void)keep_lock;

	SHOW_FLOW( 3, "got %d", buffer_idx );

	return buffer_idx;
}


// explicitly free an indirect buffer;
// this is not needed if the buffer was sent via SendIndirectBuffer()
// never_used - set to true if the buffer wasn't even sent indirectly
//              as a state buffer
// !Warning!
// if never_used is false, execution may take very long as all buffers
// must be flushed!
void Radeon_FreeIndirectBuffer( accelerator_info *ai, int buffer_idx, bool never_used )
{
	CP_info *cp = &ai->si->cp;

	SHOW_FLOW( 3, "buffer_idx=%d, never_used=%d", buffer_idx, never_used );

	// if the buffer was used as a state buffer, we don't record its usage,
	// so we don't know if the buffer was/is/will be used;
	// the only way to be sure is to let the CP run dry
	if( !never_used )
		Radeon_WaitForIdle( ai, false );

	ACQUIRE_BEN( cp->lock );

	cp->buffers.buffers[buffer_idx].next = cp->buffers.free_list;
	cp->buffers.free_list = buffer_idx;

	RELEASE_BEN( cp->lock );

	SHOW_FLOW0( 3, "done" );
}


// this function is defined at the end of the file to avoid inlining
void Radeon_WaitForRingBufferSpace( accelerator_info *ai, uint num_dwords );


// start writing to ring buffer
// num_dwords - number of dwords to write (must be precise!)
// !Warning!
// during wait, CP's benaphore is released
#define WRITE_RB_START( num_dwords ) \
	{ \
		uint32 *ring_start; \
		uint32 ring_tail, ring_tail_mask; \
		uint32 ring_tail_increment = (num_dwords); \
		if( cp->ring.space < ring_tail_increment ) \
			Radeon_WaitForRingBufferSpace( ai, ring_tail_increment ); \
		ring_start = \
			(uint32 *)(ai->mapped_memory[cp->ring.mem_type].data + cp->ring.mem_offset); \
			/*cp->ring.start;*/ \
		ring_tail = cp->ring.tail; \
		ring_tail_mask = cp->ring.tail_mask;

// write single dword to ring buffer
#define WRITE_RB( value ) \
	{ \
		uint32 val = (value); \
		SHOW_FLOW( 3, "@%d: %x", ring_tail, val ); \
		ring_start[ring_tail++] = val; \
		ring_tail &= ring_tail_mask; \
	}

// finish writing to ring buffer
#define WRITE_RB_FINISH \
		cp->ring.tail = ring_tail; \
		cp->ring.space -= ring_tail_increment; \
	}
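// Typical usage of the macros above (a minimal sketch; "some_reg" and
// "some_value" are placeholders, see Radeon_SendIndirectBuffer() below for
// a real caller):
//
//	WRITE_RB_START( 2 );			// reserve exactly 2 dwords
//	WRITE_RB( CP_PACKET0( some_reg, 1 ));	// packet header
//	WRITE_RB( some_value );			// register value
//	WRITE_RB_FINISH;			// publish new tail and space
//
// The dword count passed to WRITE_RB_START must match the number of
// WRITE_RB calls, and the caller must hold the CP benaphore.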
// submit indirect buffer for execution;
// the indirect buffer must not be used afterwards!
// buffer_idx - index of indirect buffer to submit
// buffer_size - size of indirect buffer in dwords (32 bit words)
// state_buffer_idx - index of indirect buffer to restore required state
// state_buffer_size - size of indirect buffer to restore required state
// the assigned tag is stored in the buffer's send_tag field
// (so you can wait for its execution);
// if no special state is required, set state_buffer_size to zero
void Radeon_SendIndirectBuffer( accelerator_info *ai,
	int buffer_idx, int buffer_size,
	int state_buffer_idx, int state_buffer_size, bool has_lock )
{
	CP_info *cp = &ai->si->cp;
	bool need_stateupdate;

	SHOW_FLOW( 3, "buffer_idx=%d, buffer_size=%d, state_buffer_idx=%d, state_buffer_size=%d",
		buffer_idx, buffer_size, state_buffer_idx, state_buffer_size );

	if( (buffer_size & 1) != 0 ) {
		SHOW_FLOW( 3, "buffer has odd size (%d)", buffer_size );

		// the size of indirect buffers _must_ be a multiple of 64 bits, so
		// add a nop to fulfil alignment
		Radeon_GetIndirectBufferPtr( ai, buffer_idx )[buffer_size] = RADEON_CP_PACKET2;
		buffer_size += 1;
	}

	//if( !has_lock )
		ACQUIRE_BEN( cp->lock );

	(void)has_lock;

	need_stateupdate =
		state_buffer_size > 0 && state_buffer_idx != cp->buffers.active_state;

	WRITE_RB_START( 5 + (need_stateupdate ? 3 : 0) );

	// if the indirect buffer to submit requires a special state and the
	// hardware is in the wrong state then execute state buffer
	if( need_stateupdate ) {
		SHOW_FLOW0( 3, "update state" );

		WRITE_RB( CP_PACKET0( RADEON_CP_IB_BASE, 2 ));
		WRITE_RB( cp->buffers.vm_start
			+ state_buffer_idx * INDIRECT_BUFFER_SIZE * sizeof( uint32 ));
		WRITE_RB( state_buffer_size );

		cp->buffers.active_state = state_buffer_idx;
	}

	// execute indirect buffer
	WRITE_RB( CP_PACKET0( RADEON_CP_IB_BASE, 2 ));
	WRITE_RB( cp->buffers.vm_start
		+ buffer_idx * INDIRECT_BUFFER_SIZE * sizeof( uint32 ));
	WRITE_RB( buffer_size );

	// give buffer a tag so it can be freed after execution
	WRITE_RB( CP_PACKET0( RADEON_SCRATCH_REG1, 1 ));
	WRITE_RB( cp->buffers.buffers[buffer_idx].send_tag = (int32)++cp->buffers.cur_tag );

	SHOW_FLOW( 3, "Assigned tag %d", cp->buffers.buffers[buffer_idx].send_tag );

	WRITE_RB_FINISH;

	// append buffer to list of submitted buffers
	// (buffer index 0 is valid; -1 marks an empty list)
	if( cp->buffers.newest >= 0 )
		cp->buffers.buffers[cp->buffers.newest].next = buffer_idx;
	else
		cp->buffers.oldest = buffer_idx;

	cp->buffers.newest = buffer_idx;
	cp->buffers.buffers[buffer_idx].next = -1;

	// flush writes to CP buffers
	// (this code is a bit of overkill - currently, only some WinChip/Cyrix
	//  CPUs support out-of-order writes, but we are prepared)
	// TODO: other architectures? PowerPC?
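	// (sketch for the TODO above, an untested assumption rather than part
	//  of this driver: a PowerPC port would presumably need an explicit
	//  barrier here as well, e.g.
	//      __asm__ __volatile__ ("eieio" : : : "memory");
	//  inside an #ifdef __POWERPC__ block)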
#ifdef __i386__
	__asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
#endif

	// make sure the motherboard chipset has flushed its write buffer by
	// reading some uncached memory
	//(void)*(volatile int *)si->framebuffer;
	INREG( ai->regs, RADEON_CP_RB_RPTR );

	//SHOW_FLOW( 3, "new tail: %d", cp->ring.tail );
	//snooze( 100 );

	// now the command list should really be written to memory,
	// so it's safe to instruct the graphics card to read it
	OUTREG( ai->regs, RADEON_CP_RB_WPTR, cp->ring.tail );

	// read from PCI bus to ensure correct posting
	//INREG( ai->regs, RADEON_CP_RB_RPTR );

	RELEASE_BEN( cp->lock );

	SHOW_FLOW0( 3, "done" );
}


// mark state buffer as being invalid;
// this must be done _before_ modifying the state buffer as the
// state buffer may still be in use
void Radeon_InvalidateStateBuffer( accelerator_info *ai, int state_buffer_idx )
{
	CP_info *cp = &ai->si->cp;

	// make sure the state buffer is not used anymore
	Radeon_WaitForIdle( ai, false );

	ACQUIRE_BEN( cp->lock );

	// mark state as being invalid
	if( cp->buffers.active_state == state_buffer_idx )
		cp->buffers.active_state = -1;

	RELEASE_BEN( cp->lock );
}


// wait until there is enough space in ring buffer
// num_dwords - number of dwords needed in ring buffer
// must be called with the benaphore held
void Radeon_WaitForRingBufferSpace( accelerator_info *ai, uint num_dwords )
{
	bigtime_t start_time;
	CP_info *cp = &ai->si->cp;

	start_time = system_time();

	while( getAvailRingBuffer( ai ) < num_dwords ) {
		bigtime_t sample_time;

		sample_time = system_time();

		if( sample_time - start_time > 100000 )
			break;

		RELEASE_BEN( cp->lock );

		// use exponential fall-off:
		// in the beginning do busy-waiting, later on we let the thread sleep;
		// the micro-spin is used to reduce PCI load
		if( sample_time - start_time > 5000 )
			snooze( (sample_time - start_time) / 10 );
		else
			Radeon_Spin( 1 );

		ACQUIRE_BEN( cp->lock );
	}
}
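
#if 0
// Illustrative sketch only (kept out of the build): how a caller is
// expected to use the indirect buffer API above. fill_with_commands()
// is a placeholder for whatever command packets the caller emits and
// is not a real function of this driver.
static void example_submit( accelerator_info *ai )
{
	int buffer_idx, size;
	uint32 *buffer;

	// grab a free indirect buffer (may block until one gets recycled)
	buffer_idx = Radeon_AllocIndirectBuffer( ai, false );
	buffer = Radeon_GetIndirectBufferPtr( ai, buffer_idx );

	// write command packets into the buffer
	size = fill_with_commands( buffer );

	// submit it without a special state buffer; once the CP has processed
	// it, Radeon_FreeIndirectBuffers() recycles it automatically, so it
	// must not be touched or freed afterwards
	Radeon_SendIndirectBuffer( ai, buffer_idx, size, 0, 0, false );

	// Radeon_FreeIndirectBuffer() is only needed for buffers that were
	// never submitted (e.g. an abandoned state buffer)
}
#endif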