/*
 * Copyright (c) 2006-2016 Chelsio, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#include <pthread.h>
#include <string.h>
#include <signal.h>
#include <stdbool.h>

#include "libcxgb4.h"
#include "cxgb4-abi.h"

#define PCI_VENDOR_ID_CHELSIO		0x1425

/*
 * Macros needed to support the PCI Device ID Table ...
 */
#define CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN \
	static struct { \
		unsigned vendor; \
		unsigned device; \
	} hca_table[] = {

#define CH_PCI_DEVICE_ID_FUNCTION \
		0x4

#define CH_PCI_ID_TABLE_ENTRY(__DeviceID) \
		{ \
			.vendor = PCI_VENDOR_ID_CHELSIO, \
			.device = (__DeviceID), \
		}

#define CH_PCI_DEVICE_ID_TABLE_DEFINE_END \
	}

#include "t4_chip_type.h"
#include "t4_pci_id_tbl.h"

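/*
 * Including t4_pci_id_tbl.h with the macros above expands into hca_table[],
 * the table of Chelsio PCI device IDs that cxgb4_driver_init() matches
 * devices against.
 */
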
/* Cached system page geometry, filled in by cxgb4_register_driver(). */
unsigned long c4iw_page_size;
unsigned long c4iw_page_shift;
unsigned long c4iw_page_mask;
/* Tunables read from the environment in cxgb4_driver_init():
 * CXGB4_MA_WR and T5_ENABLE_WC (write-combined doorbells, on by default). */
int ma_wr;
int t5_en_wc = 1;

/* All devices claimed by this library, walked by dump_state(). */
static TAILQ_HEAD(,c4iw_dev) devices = TAILQ_HEAD_INITIALIZER(devices);

static struct ibv_context_ops c4iw_ctx_ops = {
	.query_device = c4iw_query_device,
	.query_port = c4iw_query_port,
	.alloc_pd = c4iw_alloc_pd,
	.dealloc_pd = c4iw_free_pd,
	.reg_mr = c4iw_reg_mr,
	.dereg_mr = c4iw_dereg_mr,
	.create_cq = c4iw_create_cq,
	.resize_cq = c4iw_resize_cq,
	.destroy_cq = c4iw_destroy_cq,
	.create_srq = c4iw_create_srq,
	.modify_srq = c4iw_modify_srq,
	.destroy_srq = c4iw_destroy_srq,
	.create_qp = c4iw_create_qp,
	.modify_qp = c4iw_modify_qp,
	.destroy_qp = c4iw_destroy_qp,
	.query_qp = c4iw_query_qp,
	.create_ah = c4iw_create_ah,
	.destroy_ah = c4iw_destroy_ah,
	.attach_mcast = c4iw_attach_mcast,
	.detach_mcast = c4iw_detach_mcast,
	.post_srq_recv = c4iw_post_srq_recv,
	.req_notify_cq = c4iw_arm_cq,
};

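/*
 * Allocate the per-process ucontext: issue the GET_CONTEXT command, map the
 * kernel status page if the driver provides one, hook up the datapath ops
 * for the detected chip, and (on the first open of a device) size and
 * allocate the mmid/qpid/cqid to object lookup tables.
 */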
static struct ibv_context *c4iw_alloc_context(struct ibv_device *ibdev,
					      int cmd_fd)
{
	struct c4iw_context *context;
	struct ibv_get_context cmd;
	struct c4iw_alloc_ucontext_resp resp;
	struct c4iw_dev *rhp = to_c4iw_dev(ibdev);
	struct ibv_query_device qcmd;
	uint64_t raw_fw_ver;
	struct ibv_device_attr attr;

	context = malloc(sizeof *context);
	if (!context)
		return NULL;

	memset(context, 0, sizeof *context);
	context->ibv_ctx.cmd_fd = cmd_fd;

	resp.status_page_size = 0;
	resp.reserved = 0;
	if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd,
				&resp.ibv_resp, sizeof resp))
		goto err_free;

	if (resp.reserved)
		PDBG("%s c4iw_alloc_ucontext_resp reserved field modified by kernel\n",
		     __FUNCTION__);

	context->status_page_size = resp.status_page_size;
	if (resp.status_page_size) {
		context->status_page = mmap(NULL, resp.status_page_size,
					    PROT_READ, MAP_SHARED, cmd_fd,
					    resp.status_page_key);
		if (context->status_page == MAP_FAILED)
			goto err_free;
	}

	context->ibv_ctx.device = ibdev;
	context->ibv_ctx.ops = c4iw_ctx_ops;

	switch (rhp->chip_version) {
	case CHELSIO_T6:
		PDBG("%s T6/T5/T4 device\n", __FUNCTION__);
		/* FALLTHROUGH */
	case CHELSIO_T5:
		PDBG("%s T5/T4 device\n", __FUNCTION__);
		/* FALLTHROUGH */
	case CHELSIO_T4:
		PDBG("%s T4 device\n", __FUNCTION__);
		context->ibv_ctx.ops.async_event = c4iw_async_event;
		context->ibv_ctx.ops.post_send = c4iw_post_send;
		context->ibv_ctx.ops.post_recv = c4iw_post_receive;
		context->ibv_ctx.ops.poll_cq = c4iw_poll_cq;
		context->ibv_ctx.ops.req_notify_cq = c4iw_arm_cq;
		break;
	default:
		PDBG("%s unknown hca type %d\n", __FUNCTION__,
		     rhp->chip_version);
		goto err_unmap;
	}

	if (!rhp->mmid2ptr) {
		int ret;

		ret = ibv_cmd_query_device(&context->ibv_ctx, &attr, &raw_fw_ver, &qcmd,
					   sizeof qcmd);
		if (ret)
			goto err_unmap;
		rhp->max_mr = attr.max_mr;
		rhp->mmid2ptr = calloc(attr.max_mr, sizeof(void *));
		if (!rhp->mmid2ptr) {
			goto err_unmap;
		}
		if (rhp->abi_version < 3) {
			fprintf(stderr, "Warning: iw_cxgb4 driver is an older version"
					" than libcxgb4: %d\n", rhp->abi_version);
			rhp->max_qp = T4_QID_BASE + attr.max_qp;
		} else {
			rhp->max_qp = context->status_page->qp_start +
					context->status_page->qp_size;
		}
		rhp->qpid2ptr = calloc(rhp->max_qp, sizeof(void *));
		if (!rhp->qpid2ptr) {
			goto err_unmap;
		}
		if (rhp->abi_version < 3)
			rhp->max_cq = T4_QID_BASE + attr.max_cq;
		else
			rhp->max_cq = context->status_page->cq_start +
					context->status_page->cq_size;
		rhp->cqid2ptr = calloc(rhp->max_cq, sizeof(void *));
		if (!rhp->cqid2ptr)
			goto err_unmap;

		/* Disable userspace WC if the architecture/adapter does not
		 * support WC.
		 * Note: To forcibly disable WC in the kernel driver, use the
		 * loader tunable "hw.cxl.write_combine=0"
		 */
		if (t5_en_wc && !context->status_page->wc_supported) {
			fprintf(stderr, "iw_cxgb4 driver doesn't support Write "
				"Combine, so regular DB writes will be used\n");
			t5_en_wc = 0;
		}
	}

	return &context->ibv_ctx;

err_unmap:
	if (context->status_page_size)
		munmap(context->status_page, context->status_page_size);
err_free:
	/* These tables live in the device, not the context; free and clear
	 * them so a later open retries the allocation from scratch. */
	if (rhp->cqid2ptr) {
		free(rhp->cqid2ptr);
		rhp->cqid2ptr = NULL;
	}
	if (rhp->qpid2ptr) {
		free(rhp->qpid2ptr);
		rhp->qpid2ptr = NULL;
	}
	if (rhp->mmid2ptr) {
		free(rhp->mmid2ptr);
		rhp->mmid2ptr = NULL;
	}
	free(context);
	return NULL;
}

static void c4iw_free_context(struct ibv_context *ibctx)
{
	struct c4iw_context *context = to_c4iw_context(ibctx);

	if (context->status_page_size)
		munmap(context->status_page, context->status_page_size);
	free(context);
}

static struct verbs_device_ops c4iw_dev_ops = {
	.alloc_context = c4iw_alloc_context,
	.free_context = c4iw_free_context
};

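/*
 * Optional debug aid: when built with STALL_DETECTION, dump_state() prints
 * the raw CQ and QP state of every tracked device to stderr.  The stall
 * timeout is taken from the CXGB4_STALL_TIMEOUT environment variable.
 */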
#ifdef STALL_DETECTION

int stall_to;

static void dump_cq(struct c4iw_cq *chp)
{
	int i;

	fprintf(stderr,
		"CQ: %p id %u queue %p cidx 0x%08x sw_queue %p sw_cidx %d sw_pidx %d sw_in_use %d depth %u error %u gen %d "
		"cidx_inc %d bits_type_ts %016" PRIx64 " notempty %d\n", chp,
		chp->cq.cqid, chp->cq.queue, chp->cq.cidx,
		chp->cq.sw_queue, chp->cq.sw_cidx, chp->cq.sw_pidx, chp->cq.sw_in_use,
		chp->cq.size, chp->cq.error, chp->cq.gen, chp->cq.cidx_inc, be64toh(chp->cq.bits_type_ts),
		t4_cq_notempty(&chp->cq));

	for (i=0; i < chp->cq.size; i++) {
		u64 *p = (u64 *)(chp->cq.queue + i);

		fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64, i, be64toh(p[0]), be64toh(p[1]));
		if (i == chp->cq.cidx)
			fprintf(stderr, " <-- cidx\n");
		else
			fprintf(stderr, "\n");
		p += 2;
		fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64toh(p[0]), be64toh(p[1]));
		p += 2;
		fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64toh(p[0]), be64toh(p[1]));
		p += 2;
		fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64toh(p[0]), be64toh(p[1]));
		p += 2;
	}
}

static void dump_qp(struct c4iw_qp *qhp)
{
	int i;
	int j;
	struct t4_swsqe *swsqe;
	struct t4_swrqe *swrqe;
	u16 cidx, pidx;
	u64 *p;

	fprintf(stderr,
		"QP: %p id %u error %d flushed %d qid_mask 0x%x\n"
		"    SQ: id %u queue %p sw_queue %p cidx %u pidx %u in_use %u wq_pidx %u depth %u flags 0x%x flush_cidx %d\n"
		"    RQ: id %u queue %p sw_queue %p cidx %u pidx %u in_use %u depth %u\n",
		qhp,
		qhp->wq.sq.qid,
		qhp->wq.error,
		qhp->wq.flushed,
		qhp->wq.qid_mask,
		qhp->wq.sq.qid,
		qhp->wq.sq.queue,
		qhp->wq.sq.sw_sq,
		qhp->wq.sq.cidx,
		qhp->wq.sq.pidx,
		qhp->wq.sq.in_use,
		qhp->wq.sq.wq_pidx,
		qhp->wq.sq.size,
		qhp->wq.sq.flags,
		qhp->wq.sq.flush_cidx,
		qhp->wq.rq.qid,
		qhp->wq.rq.queue,
		qhp->wq.rq.sw_rq,
		qhp->wq.rq.cidx,
		qhp->wq.rq.pidx,
		qhp->wq.rq.in_use,
		qhp->wq.rq.size);
	cidx = qhp->wq.sq.cidx;
	pidx = qhp->wq.sq.pidx;
	if (cidx != pidx)
		fprintf(stderr, "SQ: \n");
	while (cidx != pidx) {
		swsqe = &qhp->wq.sq.sw_sq[cidx];
		fprintf(stderr, "%04u: wr_id %016" PRIx64
			" sq_wptr %08x read_len %u opcode 0x%x "
			"complete %u signaled %u cqe %016" PRIx64 " %016" PRIx64 " %016" PRIx64 " %016" PRIx64 "\n",
			cidx,
			swsqe->wr_id,
			swsqe->idx,
			swsqe->read_len,
			swsqe->opcode,
			swsqe->complete,
			swsqe->signaled,
			htobe64(((uint64_t *)&swsqe->cqe)[0]),
			htobe64(((uint64_t *)&swsqe->cqe)[1]),
			htobe64(((uint64_t *)&swsqe->cqe)[2]),
			htobe64(((uint64_t *)&swsqe->cqe)[3]));
		if (++cidx == qhp->wq.sq.size)
			cidx = 0;
	}

	fprintf(stderr, "SQ WQ: \n");
	p = (u64 *)qhp->wq.sq.queue;
	for (i=0; i < qhp->wq.sq.size * T4_SQ_NUM_SLOTS; i++) {
		for (j=0; j < T4_EQ_ENTRY_SIZE / 16; j++) {
			fprintf(stderr, "%04u %016" PRIx64 " %016" PRIx64 " ",
				i, be64toh(p[0]), be64toh(p[1]));
			if (j == 0 && i == qhp->wq.sq.wq_pidx)
				fprintf(stderr, " <-- pidx");
			fprintf(stderr, "\n");
			p += 2;
		}
	}
	cidx = qhp->wq.rq.cidx;
	pidx = qhp->wq.rq.pidx;
	if (cidx != pidx)
		fprintf(stderr, "RQ: \n");
	while (cidx != pidx) {
		swrqe = &qhp->wq.rq.sw_rq[cidx];
		fprintf(stderr, "%04u: wr_id %016" PRIx64 "\n",
			cidx,
			swrqe->wr_id);
		if (++cidx == qhp->wq.rq.size)
			cidx = 0;
	}

	fprintf(stderr, "RQ WQ: \n");
	p = (u64 *)qhp->wq.rq.queue;
	for (i=0; i < qhp->wq.rq.size * T4_RQ_NUM_SLOTS; i++) {
		for (j=0; j < T4_EQ_ENTRY_SIZE / 16; j++) {
			fprintf(stderr, "%04u %016" PRIx64 " %016" PRIx64 " ",
				i, be64toh(p[0]), be64toh(p[1]));
			if (j == 0 && i == qhp->wq.rq.pidx)
				fprintf(stderr, " <-- pidx");
			if (j == 0 && i == qhp->wq.rq.cidx)
				fprintf(stderr, " <-- cidx");
			fprintf(stderr, "\n");
			p += 2;
		}
	}
}

void dump_state(void)
{
	struct c4iw_dev *dev;
	int i;

	fprintf(stderr, "STALL DETECTED:\n");
	TAILQ_FOREACH(dev, &devices, list) {
		//pthread_spin_lock(&dev->lock);
		fprintf(stderr, "Device %s\n", dev->ibv_dev.name);
		for (i=0; i < dev->max_cq; i++) {
			if (dev->cqid2ptr[i]) {
				struct c4iw_cq *chp = dev->cqid2ptr[i];
				//pthread_spin_lock(&chp->lock);
				dump_cq(chp);
				//pthread_spin_unlock(&chp->lock);
			}
		}
		for (i=0; i < dev->max_qp; i++) {
			if (dev->qpid2ptr[i]) {
				struct c4iw_qp *qhp = dev->qpid2ptr[i];
				//pthread_spin_lock(&qhp->lock);
				dump_qp(qhp);
				//pthread_spin_unlock(&qhp->lock);
			}
		}
		//pthread_spin_unlock(&dev->lock);
	}
	fprintf(stderr, "DUMP COMPLETE:\n");
	fflush(stderr);
}
#endif /* end of STALL_DETECTION */

/*
 * c4iw_abi_version stores the ABI version of the iw_cxgb4 driver so the
 * user mode library can tell whether the driver supports kernel mode db
 * ringing.
 */
int c4iw_abi_version = 1;

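/*
 * Probe callback: read the vendor/device IDs of a uverbs device (from sysfs
 * on Linux, from the adapter's pnpinfo on FreeBSD), match them against
 * hca_table, verify the firmware major version, and allocate the c4iw_dev
 * handed back to libibverbs.
 */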
static struct verbs_device *cxgb4_driver_init(const char *uverbs_sys_path,
					      int abi_version)
{
	char devstr[IBV_SYSFS_PATH_MAX], ibdev[16], value[128], *cp;
	char dev_str[IBV_SYSFS_PATH_MAX];
	struct c4iw_dev *dev;
	unsigned vendor = 0, device = 0, fw_maj, fw_min;
	int i;
	int devnum;
	char ib_param[16];

#ifndef __linux__
	if (ibv_read_sysfs_file(uverbs_sys_path, "ibdev",
				ibdev, sizeof ibdev) < 0)
		return NULL;

	devnum = atoi(&ibdev[5]);

	if (ibdev[0] == 't' && ibdev[1] >= '4' && ibdev[1] <= '6' &&
	    strstr(&ibdev[2], "nex") && devnum >= 0) {
		snprintf(dev_str, sizeof(dev_str), "/dev/t%cnex/%d", ibdev[1],
		    devnum);
	} else
		return NULL;

	if (ibv_read_sysfs_file(dev_str, "%pnpinfo", value, sizeof value) < 0)
		return NULL;
	else {
		if (strstr(value, "vendor=")) {
			strncpy(ib_param, strstr(value, "vendor=") +
					strlen("vendor="), 6);
			ib_param[6] = '\0';
			sscanf(ib_param, "%i", &vendor);
		}

		if (strstr(value, "device=")) {
			strncpy(ib_param, strstr(value, "device=") +
					strlen("device="), 6);
			ib_param[6] = '\0';
			sscanf(ib_param, "%i", &device);
		}
	}
#else
	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &vendor);

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &device);
#endif

	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
		if (vendor == hca_table[i].vendor &&
		    device == hca_table[i].device)
			goto found;

	return NULL;

found:
	c4iw_abi_version = abi_version;

#ifndef __linux__
	if (ibv_read_sysfs_file(dev_str, "firmware_version",
				value, sizeof value) < 0)
		return NULL;
#else
	/*
	 * Verify that the firmware major number matches.  Major number
	 * mismatches are fatal.  Minor number mismatches are tolerated.
	 */
	if (ibv_read_sysfs_file(uverbs_sys_path, "ibdev",
				ibdev, sizeof ibdev) < 0)
		return NULL;

	memset(devstr, 0, sizeof devstr);
	snprintf(devstr, sizeof devstr, "%s/class/infiniband/%s",
		 ibv_get_sysfs_path(), ibdev);
	if (ibv_read_sysfs_file(devstr, "fw_ver", value, sizeof value) < 0)
		return NULL;
#endif

	cp = strtok(value+1, ".");
	sscanf(cp, "%i", &fw_maj);
	cp = strtok(NULL, ".");
	sscanf(cp, "%i", &fw_min);

	if ((signed int)fw_maj < FW_MAJ) {
		fprintf(stderr, "libcxgb4: Fatal firmware version mismatch.  "
			"Firmware major number is %u and libcxgb4 needs %u.\n",
			fw_maj, FW_MAJ);
		fflush(stderr);
		return NULL;
	}

	DBGLOG("libcxgb4");

	if ((signed int)fw_min < FW_MIN) {
		PDBG("libcxgb4: non-fatal firmware version mismatch.  "
			"Firmware minor number is %u and libcxgb4 needs %u.\n",
			fw_min, FW_MIN);
		fflush(stderr);
	}

	PDBG("%s found vendor %d device %d type %d\n",
	     __FUNCTION__, vendor, device, CHELSIO_CHIP_VERSION(hca_table[i].device >> 8));

	dev = calloc(1, sizeof *dev);
	if (!dev) {
		return NULL;
	}

	pthread_spin_init(&dev->lock, PTHREAD_PROCESS_PRIVATE);
	dev->ibv_dev.ops = &c4iw_dev_ops;
	dev->chip_version = CHELSIO_CHIP_VERSION(hca_table[i].device >> 8);
	dev->abi_version = abi_version;

	PDBG("%s device claimed\n", __FUNCTION__);
	TAILQ_INSERT_TAIL(&devices, dev, list);
#ifdef STALL_DETECTION
{
	char *c = getenv("CXGB4_STALL_TIMEOUT");
	if (c) {
		errno = 0;
		stall_to = strtol(c, NULL, 0);
		if (errno || stall_to < 0)
			stall_to = 0;
	}
}
#endif
{
	char *c = getenv("CXGB4_MA_WR");
	if (c) {
		ma_wr = strtol(c, NULL, 0);
		if (ma_wr != 1)
			ma_wr = 0;
	}
}
{
	char *c = getenv("T5_ENABLE_WC");
	if (c) {
		t5_en_wc = strtol(c, NULL, 0);
		if (t5_en_wc != 1)
			t5_en_wc = 0;
	}
}

	return &dev->ibv_dev;
}

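/*
 * Library constructor: cache the system page size/shift/mask and register
 * this driver with libibverbs so cxgb4_driver_init() is invoked for each
 * uverbs device discovered at load time.
 */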
static __attribute__((constructor)) void cxgb4_register_driver(void)
{
	c4iw_page_size = sysconf(_SC_PAGESIZE);
	c4iw_page_shift = long_log2(c4iw_page_size);
	c4iw_page_mask = ~(c4iw_page_size - 1);
	verbs_register_driver("cxgb4", cxgb4_driver_init);
}

#ifdef STATS
void __attribute__ ((destructor)) cs_fini(void);
void __attribute__ ((destructor)) cs_fini(void)
{
	syslog(LOG_NOTICE, "cxgb4 stats - sends %lu recv %lu read %lu "
	       "write %lu arm %lu cqe %lu mr %lu qp %lu cq %lu\n",
	       c4iw_stats.send, c4iw_stats.recv, c4iw_stats.read,
	       c4iw_stats.write, c4iw_stats.arm, c4iw_stats.cqe,
	       c4iw_stats.mr, c4iw_stats.qp, c4iw_stats.cq);
}
#endif