/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2020 Marvell International Ltd.
 *
 * Support functions for managing command queues used for
 * various hardware blocks.
 *
 * The common command queue infrastructure abstracts out the
 * software necessary for adding to Octeon's chained queue
 * structures. These structures are used for commands to the
 * PKO, ZIP, DFA, RAID, HNA, and DMA engine blocks. Although each
 * hardware unit takes commands and CSRs of different types,
 * they all use basic linked command buffers to store the
 * pending request. In general, users of the CVMX API don't
 * call cvmx-cmd-queue functions directly. Instead the hardware
 * unit specific wrapper should be used. The wrappers perform
 * unit specific validation and CSR writes to submit the
 * commands.
 *
 * Even though most software will never directly interact with
 * cvmx-cmd-queue, knowledge of its internal workings can help
 * in diagnosing performance problems and help with debugging.
 *
 * Command queue pointers are stored in a global named block
 * called "cvmx_cmd_queues". Except for the PKO queues, each
 * hardware queue is stored in its own cache line to reduce SMP
 * contention on spin locks. The PKO queues are stored such that
 * every 16th queue is next to each other in memory. This scheme
 * keeps queues in separate cache lines when the number of
 * queues per port is low. With 16 queues per port, the first
 * queue for each port is in the same cache area. The second
 * queues for each port are in another area, etc. This allows
 * software to implement very efficient lockless PKO with 16
 * queues per port using a minimum of cache lines per core. All
 * queues for a given core will be isolated in the same cache
 * area.
 *
 * In addition to the memory pointer layout, cvmx-cmd-queue
 * provides an optimized fair ll/sc locking mechanism for the
 * queues. The lock uses a "ticket / now serving" model to
 * maintain fair order on contended locks. In addition, it uses
 * predicted locking time to limit cache contention. When a core
 * knows it must wait in line for a lock, it spins on the
 * internal cycle counter to completely eliminate any causes of
 * bus traffic.
 */

#ifndef __CVMX_CMD_QUEUE_H__
#define __CVMX_CMD_QUEUE_H__

/**
 * By default we disable the max depth support. Most programs
 * don't use it and it slows down the command queue processing
 * significantly.
 */
#ifndef CVMX_CMD_QUEUE_ENABLE_MAX_DEPTH
#define CVMX_CMD_QUEUE_ENABLE_MAX_DEPTH 0
#endif
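
/*
 * Illustrative sketch, not part of this API: the "ticket / now
 * serving" lock described above behaves conceptually like the
 * pseudo-code below. Each contender atomically takes a ticket and
 * waits until its number comes up; the real implementation uses
 * ll/sc sequences and, while waiting, spins on the cycle counter
 * rather than the lock word to avoid bus traffic:
 *
 *	u64 my_ticket = atomic_fetch_add(&lock->ticket, 1);
 *	while (lock->now_serving != my_ticket)
 *		;			<- wait for our turn
 *	... critical section ...
 *	lock->now_serving++;		<- pass the lock to the next waiter
 */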

/**
 * Enumeration representing all hardware blocks that use command
 * queues. Each hardware block has up to 65536 sub identifiers for
 * multiple command queues. Not all chips support all hardware
 * units.
 */
typedef enum {
	CVMX_CMD_QUEUE_PKO_BASE = 0x00000,
#define CVMX_CMD_QUEUE_PKO(queue) \
	((cvmx_cmd_queue_id_t)(CVMX_CMD_QUEUE_PKO_BASE + (0xffff & (queue))))
	CVMX_CMD_QUEUE_ZIP = 0x10000,
#define CVMX_CMD_QUEUE_ZIP_QUE(queue) \
	((cvmx_cmd_queue_id_t)(CVMX_CMD_QUEUE_ZIP + (0xffff & (queue))))
	CVMX_CMD_QUEUE_DFA = 0x20000,
	CVMX_CMD_QUEUE_RAID = 0x30000,
	CVMX_CMD_QUEUE_DMA_BASE = 0x40000,
#define CVMX_CMD_QUEUE_DMA(queue) \
	((cvmx_cmd_queue_id_t)(CVMX_CMD_QUEUE_DMA_BASE + (0xffff & (queue))))
	CVMX_CMD_QUEUE_BCH = 0x50000,
#define CVMX_CMD_QUEUE_BCH(queue) ((cvmx_cmd_queue_id_t)(CVMX_CMD_QUEUE_BCH + (0xffff & (queue))))
	CVMX_CMD_QUEUE_HNA = 0x60000,
	CVMX_CMD_QUEUE_END = 0x70000,
} cvmx_cmd_queue_id_t;

#define CVMX_CMD_QUEUE_ZIP3_QUE(node, queue) \
	((cvmx_cmd_queue_id_t)((node) << 24 | CVMX_CMD_QUEUE_ZIP | (0xffff & (queue))))

/**
 * Command write operations can fail if the command queue needs
 * a new buffer and the associated FPA pool is empty. It can also
 * fail if the number of queued command words reaches the maximum
 * set at initialization.
 */
typedef enum {
	CVMX_CMD_QUEUE_SUCCESS = 0,
	CVMX_CMD_QUEUE_NO_MEMORY = -1,
	CVMX_CMD_QUEUE_FULL = -2,
	CVMX_CMD_QUEUE_INVALID_PARAM = -3,
	CVMX_CMD_QUEUE_ALREADY_SETUP = -4,
} cvmx_cmd_queue_result_t;

typedef struct {
	/* First 64-bit word: */
	u64 fpa_pool : 16;
	u64 base_paddr : 48;
	s32 index;
	u16 max_depth;
	u16 pool_size_m1;
} __cvmx_cmd_queue_state_t;

/**
 * Command-queue locking uses a fair ticket spinlock algorithm,
 * with 64-bit tickets for endianness-neutrality and
 * counter overflow protection.
 * The lock is free when both counters are equal.
 */
typedef struct {
	u64 ticket;
	u64 now_serving;
} __cvmx_cmd_queue_lock_t;

/**
 * @INTERNAL
 * This structure contains the global state of all command queues.
 * It is stored in a bootmem named block and shared by all
 * applications running on Octeon. Tickets are stored in a different
 * cache line than the queue information to reduce contention on the
 * ll/sc used to get a ticket. Otherwise, updates of the queue state
 * would cause the ll/sc to fail quite often.
 */
typedef struct {
	__cvmx_cmd_queue_lock_t lock[(CVMX_CMD_QUEUE_END >> 16) * 256];
	__cvmx_cmd_queue_state_t state[(CVMX_CMD_QUEUE_END >> 16) * 256];
} __cvmx_cmd_queue_all_state_t;

extern __cvmx_cmd_queue_all_state_t *__cvmx_cmd_queue_state_ptrs[CVMX_MAX_NODES];

/**
 * @INTERNAL
 * Internal function to handle the corner cases
 * of adding command words to a queue when the current
 * block is getting full.
 */
cvmx_cmd_queue_result_t __cvmx_cmd_queue_write_raw(cvmx_cmd_queue_id_t queue_id,
						   __cvmx_cmd_queue_state_t *qptr, int cmd_count,
						   const u64 *cmds);

/**
 * Initialize a command queue for use. The initial FPA buffer is
 * allocated and the hardware unit is configured to point to the
 * new command queue.
 *
 * @param queue_id Hardware command queue to initialize.
 * @param max_depth Maximum outstanding commands that can be queued.
 * @param fpa_pool FPA pool the command queues should come from.
 * @param pool_size Size of each buffer in the FPA pool (bytes)
 *
 * Return: CVMX_CMD_QUEUE_SUCCESS or a failure code
 */
cvmx_cmd_queue_result_t cvmx_cmd_queue_initialize(cvmx_cmd_queue_id_t queue_id, int max_depth,
						  int fpa_pool, int pool_size);
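
/*
 * Typical initialization (illustrative sketch; the PKO queue, FPA
 * pool and buffer size arguments below are hypothetical, and real
 * code normally reaches this through the hardware-unit specific
 * wrappers mentioned in the file comment):
 *
 *	cvmx_cmd_queue_result_t r;
 *
 *	r = cvmx_cmd_queue_initialize(CVMX_CMD_QUEUE_PKO(0), max_depth,
 *				      fpa_pool, pool_size);
 *	if (r != CVMX_CMD_QUEUE_SUCCESS)
 *		... handle CVMX_CMD_QUEUE_NO_MEMORY,
 *		    CVMX_CMD_QUEUE_ALREADY_SETUP, etc. ...
 */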

/**
 * Shut down a queue and free its command buffers to the FPA. The
 * hardware connected to the queue must be stopped before this
 * function is called.
 *
 * @param queue_id Queue to shut down
 *
 * Return: CVMX_CMD_QUEUE_SUCCESS or a failure code
 */
cvmx_cmd_queue_result_t cvmx_cmd_queue_shutdown(cvmx_cmd_queue_id_t queue_id);

/**
 * Return the number of command words pending in the queue. This
 * function may be relatively slow for some hardware units.
 *
 * @param queue_id Hardware command queue to query
 *
 * Return: Number of outstanding commands
 */
int cvmx_cmd_queue_length(cvmx_cmd_queue_id_t queue_id);

/**
 * Return the command buffer to be written to. The purpose of this
 * function is to allow CVMX routine access to the low level buffer
 * for initial hardware setup. User applications should not call this
 * function directly.
 *
 * @param queue_id Command queue to query
 *
 * Return: Command buffer or NULL on failure
 */
void *cvmx_cmd_queue_buffer(cvmx_cmd_queue_id_t queue_id);

/**
 * @INTERNAL
 * Retrieve or allocate the command queue state named block
 */
cvmx_cmd_queue_result_t __cvmx_cmd_queue_init_state_ptr(unsigned int node);

/**
 * @INTERNAL
 * Get the index into the state arrays for the supplied queue id.
 *
 * @param queue_id Queue ID to get an index for
 *
 * Return: Index into the state arrays
 */
static inline unsigned int __cvmx_cmd_queue_get_index(cvmx_cmd_queue_id_t queue_id)
{
	/* Warning: This code currently only works with devices that have 256
	 * queues or fewer. Devices with more than 16 queues are laid out in
	 * memory to allow cores quick access to every 16th queue. This reduces
	 * cache thrashing when you are running 16 queues per port to support
	 * lockless operation.
	 */
	unsigned int unit = (queue_id >> 16) & 0xff;
	unsigned int q = (queue_id >> 4) & 0xf;
	unsigned int core = queue_id & 0xf;

	return (unit << 8) | (core << 4) | q;
}
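
/*
 * Worked example of the layout above (values follow directly from
 * the bit fields; the PKO queue numbers are hypothetical):
 *
 *	CVMX_CMD_QUEUE_PKO(0)  -> unit 0, q 0, core 0 -> index 0x000
 *	CVMX_CMD_QUEUE_PKO(16) -> unit 0, q 1, core 0 -> index 0x001
 *	CVMX_CMD_QUEUE_PKO(1)  -> unit 0, q 0, core 1 -> index 0x010
 *
 * Queues 0 and 16 land in adjacent state entries while queues 0 and
 * 1 are 16 entries apart, which is the "every 16th queue is next to
 * each other" layout described in the file comment.
 */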

static inline int __cvmx_cmd_queue_get_node(cvmx_cmd_queue_id_t queue_id)
{
	unsigned int node = queue_id >> 24;
	return node;
}

/**
 * @INTERNAL
 * Lock the supplied queue so nobody else is updating it at the same
 * time as us.
 *
 * @param queue_id Queue ID to lock
 *
 */
static inline void __cvmx_cmd_queue_lock(cvmx_cmd_queue_id_t queue_id)
{
}

/**
 * @INTERNAL
 * Unlock the queue, flushing all writes.
 *
 * @param queue_id Queue ID to unlock
 *
 */
static inline void __cvmx_cmd_queue_unlock(cvmx_cmd_queue_id_t queue_id)
{
	CVMX_SYNCWS; /* nudge out the unlock. */
}

/**
 * @INTERNAL
 * Initialize a command-queue lock to "unlocked" state.
 */
static inline void __cvmx_cmd_queue_lock_init(cvmx_cmd_queue_id_t queue_id)
{
	unsigned int index = __cvmx_cmd_queue_get_index(queue_id);
	unsigned int node = __cvmx_cmd_queue_get_node(queue_id);

	__cvmx_cmd_queue_state_ptrs[node]->lock[index] = (__cvmx_cmd_queue_lock_t){ 0, 0 };
	CVMX_SYNCWS;
}

/**
 * @INTERNAL
 * Get the queue state structure for the given queue id
 *
 * @param queue_id Queue id to get
 *
 * Return: Queue structure or NULL on failure
 */
static inline __cvmx_cmd_queue_state_t *__cvmx_cmd_queue_get_state(cvmx_cmd_queue_id_t queue_id)
{
	unsigned int index;
	unsigned int node;
	__cvmx_cmd_queue_state_t *qptr;

	node = __cvmx_cmd_queue_get_node(queue_id);
	index = __cvmx_cmd_queue_get_index(queue_id);

	if (cvmx_unlikely(!__cvmx_cmd_queue_state_ptrs[node]))
		__cvmx_cmd_queue_init_state_ptr(node);

	qptr = &__cvmx_cmd_queue_state_ptrs[node]->state[index];
	return qptr;
}

/**
 * Write an arbitrary number of command words to a command queue.
 * This is a generic function; the fixed number of command word
 * functions yield higher performance.
 *
 * @param queue_id Hardware command queue to write to
 * @param use_locking
 *     Use internal locking to ensure exclusive access for queue
 *     updates. If you don't use this locking you must ensure
 *     exclusivity some other way. Locking is strongly recommended.
 * @param cmd_count Number of command words to write
 * @param cmds Array of commands to write
 *
 * Return: CVMX_CMD_QUEUE_SUCCESS or a failure code
 */
static inline cvmx_cmd_queue_result_t
cvmx_cmd_queue_write(cvmx_cmd_queue_id_t queue_id, bool use_locking, int cmd_count, const u64 *cmds)
{
	cvmx_cmd_queue_result_t ret = CVMX_CMD_QUEUE_SUCCESS;
	u64 *cmd_ptr;

	__cvmx_cmd_queue_state_t *qptr = __cvmx_cmd_queue_get_state(queue_id);

	/* Make sure nobody else is updating the same queue */
	if (cvmx_likely(use_locking))
		__cvmx_cmd_queue_lock(queue_id);

	/* Most of the time there are plenty of free words in the current block */
	if (cvmx_unlikely((qptr->index + cmd_count) >= qptr->pool_size_m1)) {
		/* The rare case when nearing the end of the block */
		ret = __cvmx_cmd_queue_write_raw(queue_id, qptr, cmd_count, cmds);
	} else {
		cmd_ptr = (u64 *)cvmx_phys_to_ptr((u64)qptr->base_paddr);
		/* This loop is easy for the compiler to unroll in the likely case */
		while (cmd_count > 0) {
			cmd_ptr[qptr->index++] = *cmds++;
			cmd_count--;
		}
	}

	/* All updates are complete. Release the lock and return */
	if (cvmx_likely(use_locking))
		__cvmx_cmd_queue_unlock(queue_id);
	else
		CVMX_SYNCWS;

	return ret;
}
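
/*
 * Illustrative usage (hypothetical queue and command words; the
 * hardware-unit wrappers normally build the command words and
 * submit them for you):
 *
 *	u64 cmds[4] = { ... unit specific command words ... };
 *	cvmx_cmd_queue_result_t r;
 *
 *	r = cvmx_cmd_queue_write(CVMX_CMD_QUEUE_DMA(0), true, 4, cmds);
 *	if (r == CVMX_CMD_QUEUE_NO_MEMORY)
 *		... the FPA pool backing the queue is empty ...
 */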

/**
 * Simple function to write two command words to a command queue.
 *
 * @param queue_id Hardware command queue to write to
 * @param use_locking
 *     Use internal locking to ensure exclusive access for queue
 *     updates. If you don't use this locking you must ensure
 *     exclusivity some other way. Locking is strongly recommended.
 * @param cmd1 Command
 * @param cmd2 Command
 *
 * Return: CVMX_CMD_QUEUE_SUCCESS or a failure code
 */
static inline cvmx_cmd_queue_result_t cvmx_cmd_queue_write2(cvmx_cmd_queue_id_t queue_id,
							    bool use_locking, u64 cmd1, u64 cmd2)
{
	cvmx_cmd_queue_result_t ret = CVMX_CMD_QUEUE_SUCCESS;
	u64 *cmd_ptr;

	__cvmx_cmd_queue_state_t *qptr = __cvmx_cmd_queue_get_state(queue_id);

	/* Make sure nobody else is updating the same queue */
	if (cvmx_likely(use_locking))
		__cvmx_cmd_queue_lock(queue_id);

	if (cvmx_unlikely((qptr->index + 2) >= qptr->pool_size_m1)) {
		/* The rare case when nearing the end of the block */
		u64 cmds[2];

		cmds[0] = cmd1;
		cmds[1] = cmd2;
		ret = __cvmx_cmd_queue_write_raw(queue_id, qptr, 2, cmds);
	} else {
		/* The likely fast-path case */
		cmd_ptr = (u64 *)cvmx_phys_to_ptr((u64)qptr->base_paddr);
		cmd_ptr += qptr->index;
		qptr->index += 2;
		cmd_ptr[0] = cmd1;
		cmd_ptr[1] = cmd2;
	}

	/* All updates are complete. Release the lock and return */
	if (cvmx_likely(use_locking))
		__cvmx_cmd_queue_unlock(queue_id);
	else
		CVMX_SYNCWS;

	return ret;
}

/**
 * Simple function to write three command words to a command queue.
 *
 * @param queue_id Hardware command queue to write to
 * @param use_locking
 *     Use internal locking to ensure exclusive access for queue
 *     updates. If you don't use this locking you must ensure
 *     exclusivity some other way. Locking is strongly recommended.
 * @param cmd1 Command
 * @param cmd2 Command
 * @param cmd3 Command
 *
 * Return: CVMX_CMD_QUEUE_SUCCESS or a failure code
 */
static inline cvmx_cmd_queue_result_t
cvmx_cmd_queue_write3(cvmx_cmd_queue_id_t queue_id, bool use_locking, u64 cmd1, u64 cmd2, u64 cmd3)
{
	cvmx_cmd_queue_result_t ret = CVMX_CMD_QUEUE_SUCCESS;
	__cvmx_cmd_queue_state_t *qptr = __cvmx_cmd_queue_get_state(queue_id);
	u64 *cmd_ptr;

	/* Make sure nobody else is updating the same queue */
	if (cvmx_likely(use_locking))
		__cvmx_cmd_queue_lock(queue_id);

	if (cvmx_unlikely((qptr->index + 3) >= qptr->pool_size_m1)) {
		/* The rare case when nearing the end of the block */
		u64 cmds[3];

		cmds[0] = cmd1;
		cmds[1] = cmd2;
		cmds[2] = cmd3;
		ret = __cvmx_cmd_queue_write_raw(queue_id, qptr, 3, cmds);
	} else {
		/* The likely fast-path case */
		cmd_ptr = (u64 *)cvmx_phys_to_ptr((u64)qptr->base_paddr);
		cmd_ptr += qptr->index;
		qptr->index += 3;
		cmd_ptr[0] = cmd1;
		cmd_ptr[1] = cmd2;
		cmd_ptr[2] = cmd3;
	}

	/* All updates are complete. Release the lock and return */
	if (cvmx_likely(use_locking))
		__cvmx_cmd_queue_unlock(queue_id);
	else
		CVMX_SYNCWS;

	return ret;
}

#endif /* __CVMX_CMD_QUEUE_H__ */