dev.c revision 309378
1/*
2 * Copyright (c) 2006-2014 Chelsio, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses.  You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 *     Redistribution and use in source and binary forms, with or
11 *     without modification, are permitted provided that the following
12 *     conditions are met:
13 *
14 *      - Redistributions of source code must retain the above
15 *        copyright notice, this list of conditions and the following
16 *        disclaimer.
17 *
18 *      - Redistributions in binary form must reproduce the above
19 *        copyright notice, this list of conditions and the following
20 *        disclaimer in the documentation and/or other materials
21 *        provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32#if HAVE_CONFIG_H
33#  include <config.h>
34#endif				/* HAVE_CONFIG_H */
35
36#include <stdio.h>
37#include <stdlib.h>
38#include <unistd.h>
39#include <errno.h>
40#include <sys/mman.h>
41#include <pthread.h>
42#include <string.h>
43#include <signal.h>
44
45#include "libcxgb4.h"
46#include "cxgb4-abi.h"
47
/* Vendor value stored in every hca_table[] entry. */
#define PCI_VENDOR_ID_CHELSIO		0x1425

/*
 * Building blocks for the PCI device ID table.  These must be defined
 * before t4_pci_id_tbl.h is included; that header is presumably what
 * strings them together into hca_table[] (TODO confirm against the header).
 */

/* Opens the table: an array of { vendor, device } pairs named hca_table. */
#define CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN \
	struct { \
		unsigned int vendor; \
		unsigned int device; \
	} hca_table[] = {

/* NOTE(review): looks like the PCI function this library binds to - confirm. */
#define CH_PCI_DEVICE_ID_FUNCTION \
		0x4

/* One table row: the Chelsio vendor ID paired with the given device ID. */
#define CH_PCI_ID_TABLE_ENTRY(__DeviceID) \
		{ \
			.vendor = PCI_VENDOR_ID_CHELSIO, \
			.device = (__DeviceID), \
		}

/* Closes the initializer opened by CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN. */
#define CH_PCI_DEVICE_ID_TABLE_DEFINE_END \
	}
70
71#include "t4_chip_type.h"
72#include "t4_pci_id_tbl.h"
73
/* Every device claimed by cxgb4_driver_init() is linked onto this list. */
SLIST_HEAD(devices_struct, c4iw_dev) devices;

/* Host page geometry, computed once in cxgb4_register_driver(). */
unsigned long c4iw_page_size;
unsigned long c4iw_page_shift;
unsigned long c4iw_page_mask;

/* Becomes 1 only when the CXGB4_MA_WR environment variable parses to 1. */
int ma_wr;

/* On by default; a T5_ENABLE_WC value other than 1 switches it off. */
int t5_en_wc = 1;
81
82static struct ibv_context_ops c4iw_ctx_ops = {
83	.query_device = c4iw_query_device,
84	.query_port = c4iw_query_port,
85	.alloc_pd = c4iw_alloc_pd,
86	.dealloc_pd = c4iw_free_pd,
87	.reg_mr = c4iw_reg_mr,
88	.dereg_mr = c4iw_dereg_mr,
89	.create_cq = c4iw_create_cq,
90	.resize_cq = c4iw_resize_cq,
91	.destroy_cq = c4iw_destroy_cq,
92	.create_srq = c4iw_create_srq,
93	.modify_srq = c4iw_modify_srq,
94	.destroy_srq = c4iw_destroy_srq,
95	.create_qp = c4iw_create_qp,
96	.modify_qp = c4iw_modify_qp,
97	.destroy_qp = c4iw_destroy_qp,
98	.query_qp = c4iw_query_qp,
99	.create_ah = c4iw_create_ah,
100	.destroy_ah = c4iw_destroy_ah,
101	.attach_mcast = c4iw_attach_mcast,
102	.detach_mcast = c4iw_detach_mcast,
103	.post_srq_recv = c4iw_post_srq_recv,
104	.req_notify_cq = c4iw_arm_cq,
105};
106
107static struct ibv_context *c4iw_alloc_context(struct ibv_device *ibdev,
108					      int cmd_fd)
109{
110	struct c4iw_context *context;
111	struct ibv_get_context cmd;
112	struct c4iw_alloc_ucontext_resp resp;
113	struct c4iw_dev *rhp = to_c4iw_dev(ibdev);
114	struct ibv_query_device qcmd;
115	uint64_t raw_fw_ver;
116	struct ibv_device_attr attr;
117
118	context = malloc(sizeof *context);
119	if (!context)
120		return NULL;
121
122	memset(context, 0, sizeof *context);
123	context->ibv_ctx.cmd_fd = cmd_fd;
124
125	resp.status_page_size = 0;
126	resp.reserved = 0;
127	if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd,
128				&resp.ibv_resp, sizeof resp))
129		goto err_free;
130
131	if (resp.reserved)
132		PDBG("%s c4iw_alloc_ucontext_resp reserved field modified by kernel\n",
133		     __FUNCTION__);
134
135	context->status_page_size = resp.status_page_size;
136	if (resp.status_page_size) {
137		context->status_page = mmap(NULL, resp.status_page_size,
138					    PROT_READ, MAP_SHARED, cmd_fd,
139					    resp.status_page_key);
140		if (context->status_page == MAP_FAILED)
141			goto err_free;
142	}
143
144	context->ibv_ctx.device = ibdev;
145	context->ibv_ctx.ops = c4iw_ctx_ops;
146
147	switch (rhp->chip_version) {
148	case CHELSIO_T5:
149		PDBG("%s T5/T4 device\n", __FUNCTION__);
150	case CHELSIO_T4:
151		PDBG("%s T4 device\n", __FUNCTION__);
152		context->ibv_ctx.ops.async_event = c4iw_async_event;
153		context->ibv_ctx.ops.post_send = c4iw_post_send;
154		context->ibv_ctx.ops.post_recv = c4iw_post_receive;
155		context->ibv_ctx.ops.poll_cq = c4iw_poll_cq;
156		context->ibv_ctx.ops.req_notify_cq = c4iw_arm_cq;
157		break;
158	default:
159		PDBG("%s unknown hca type %d\n", __FUNCTION__,
160		     rhp->chip_version);
161		goto err_unmap;
162		break;
163	}
164
165	if (!rhp->mmid2ptr) {
166		int ret;
167
168		ret = ibv_cmd_query_device(&context->ibv_ctx, &attr, &raw_fw_ver, &qcmd,
169					   sizeof qcmd);
170		if (ret)
171			goto err_unmap;
172		rhp->max_mr = attr.max_mr;
173		rhp->mmid2ptr = calloc(attr.max_mr, sizeof(void *));
174		if (!rhp->mmid2ptr) {
175			goto err_unmap;
176		}
177		rhp->max_qp = T4_QID_BASE + attr.max_cq;
178		rhp->qpid2ptr = calloc(T4_QID_BASE + attr.max_cq, sizeof(void *));
179		if (!rhp->qpid2ptr) {
180			goto err_unmap;
181		}
182		rhp->max_cq = T4_QID_BASE + attr.max_cq;
183		rhp->cqid2ptr = calloc(T4_QID_BASE + attr.max_cq, sizeof(void *));
184		if (!rhp->cqid2ptr)
185			goto err_unmap;
186	}
187
188	return &context->ibv_ctx;
189
190err_unmap:
191	munmap(context->status_page, context->status_page_size);
192err_free:
193	if (rhp->cqid2ptr)
194		free(rhp->cqid2ptr);
195	if (rhp->qpid2ptr)
196		free(rhp->cqid2ptr);
197	if (rhp->mmid2ptr)
198		free(rhp->cqid2ptr);
199	free(context);
200	return NULL;
201}
202
203static void c4iw_free_context(struct ibv_context *ibctx)
204{
205	struct c4iw_context *context = to_c4iw_context(ibctx);
206
207	if (context->status_page_size)
208		munmap(context->status_page, context->status_page_size);
209	free(context);
210}
211
212static struct ibv_device_ops c4iw_dev_ops = {
213	.alloc_context = c4iw_alloc_context,
214	.free_context = c4iw_free_context
215};
216
217#ifdef STALL_DETECTION
218
/*
 * Stall timeout.  cxgb4_driver_init() loads it from the
 * CXGB4_STALL_TIMEOUT environment variable; it stays 0 otherwise.
 */
int stall_to;
220
221static void dump_cq(struct c4iw_cq *chp)
222{
223	int i;
224
225	fprintf(stderr,
226 		"CQ: %p id %u queue %p cidx 0x%08x sw_queue %p sw_cidx %d sw_pidx %d sw_in_use %d depth %u error %u gen %d "
227		"cidx_inc %d bits_type_ts %016" PRIx64 " notempty %d\n", chp,
228                chp->cq.cqid, chp->cq.queue, chp->cq.cidx,
229	 	chp->cq.sw_queue, chp->cq.sw_cidx, chp->cq.sw_pidx, chp->cq.sw_in_use,
230                chp->cq.size, chp->cq.error, chp->cq.gen, chp->cq.cidx_inc, be64_to_cpu(chp->cq.bits_type_ts),
231		t4_cq_notempty(&chp->cq) || (chp->iq ? t4_iq_notempty(chp->iq) : 0));
232
233	for (i=0; i < chp->cq.size; i++) {
234		u64 *p = (u64 *)(chp->cq.queue + i);
235
236		fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64, i, be64_to_cpu(p[0]), be64_to_cpu(p[1]));
237		if (i == chp->cq.cidx)
238			fprintf(stderr, " <-- cidx\n");
239		else
240			fprintf(stderr, "\n");
241		p+= 2;
242		fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64_to_cpu(p[0]), be64_to_cpu(p[1]));
243		p+= 2;
244		fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64_to_cpu(p[0]), be64_to_cpu(p[1]));
245		p+= 2;
246		fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64_to_cpu(p[0]), be64_to_cpu(p[1]));
247		p+= 2;
248	}
249}
250
251static void dump_qp(struct c4iw_qp *qhp)
252{
253	int i;
254	int j;
255	struct t4_swsqe *swsqe;
256	struct t4_swrqe *swrqe;
257	u16 cidx, pidx;
258	u64 *p;
259
260	fprintf(stderr,
261		"QP: %p id %u error %d flushed %d qid_mask 0x%x\n"
262		"    SQ: id %u queue %p sw_queue %p cidx %u pidx %u in_use %u wq_pidx %u depth %u flags 0x%x flush_cidx %d\n"
263		"    RQ: id %u queue %p sw_queue %p cidx %u pidx %u in_use %u depth %u\n",
264		qhp,
265		qhp->wq.sq.qid,
266		qhp->wq.error,
267		qhp->wq.flushed,
268		qhp->wq.qid_mask,
269		qhp->wq.sq.qid,
270		qhp->wq.sq.queue,
271		qhp->wq.sq.sw_sq,
272		qhp->wq.sq.cidx,
273		qhp->wq.sq.pidx,
274		qhp->wq.sq.in_use,
275		qhp->wq.sq.wq_pidx,
276		qhp->wq.sq.size,
277		qhp->wq.sq.flags,
278		qhp->wq.sq.flush_cidx,
279		qhp->wq.rq.qid,
280		qhp->wq.rq.queue,
281		qhp->wq.rq.sw_rq,
282		qhp->wq.rq.cidx,
283		qhp->wq.rq.pidx,
284		qhp->wq.rq.in_use,
285		qhp->wq.rq.size);
286	cidx = qhp->wq.sq.cidx;
287	pidx = qhp->wq.sq.pidx;
288	if (cidx != pidx)
289		fprintf(stderr, "SQ: \n");
290	while (cidx != pidx) {
291		swsqe = &qhp->wq.sq.sw_sq[cidx];
292		fprintf(stderr, "%04u: wr_id %016" PRIx64
293			" sq_wptr %08x read_len %u opcode 0x%x "
294			"complete %u signaled %u cqe %016" PRIx64 " %016" PRIx64 " %016" PRIx64 " %016" PRIx64 "\n",
295			cidx,
296			swsqe->wr_id,
297			swsqe->idx,
298			swsqe->read_len,
299			swsqe->opcode,
300			swsqe->complete,
301			swsqe->signaled,
302			cpu_to_be64(swsqe->cqe.u.flits[0]),
303			cpu_to_be64(swsqe->cqe.u.flits[1]),
304			cpu_to_be64((u64)swsqe->cqe.reserved),
305			cpu_to_be64(swsqe->cqe.bits_type_ts));
306		if (++cidx == qhp->wq.sq.size)
307			cidx = 0;
308	}
309
310	fprintf(stderr, "SQ WQ: \n");
311	p = (u64 *)qhp->wq.sq.queue;
312	for (i=0; i < qhp->wq.sq.size * T4_SQ_NUM_SLOTS; i++) {
313		for (j=0; j < T4_EQ_ENTRY_SIZE / 16; j++) {
314			fprintf(stderr, "%04u %016" PRIx64 " %016" PRIx64 " ",
315				i, ntohll(p[0]), ntohll(p[1]));
316			if (j == 0 && i == qhp->wq.sq.wq_pidx)
317				fprintf(stderr, " <-- pidx");
318			fprintf(stderr, "\n");
319			p += 2;
320		}
321	}
322	cidx = qhp->wq.rq.cidx;
323	pidx = qhp->wq.rq.pidx;
324	if (cidx != pidx)
325		fprintf(stderr, "RQ: \n");
326	while (cidx != pidx) {
327		swrqe = &qhp->wq.rq.sw_rq[cidx];
328		fprintf(stderr, "%04u: wr_id %016" PRIx64 "\n",
329			cidx,
330			swrqe->wr_id );
331		if (++cidx == qhp->wq.rq.size)
332			cidx = 0;
333	}
334
335	fprintf(stderr, "RQ WQ: \n");
336	p = (u64 *)qhp->wq.rq.queue;
337	for (i=0; i < qhp->wq.rq.size * T4_RQ_NUM_SLOTS; i++) {
338		for (j=0; j < T4_EQ_ENTRY_SIZE / 16; j++) {
339			fprintf(stderr, "%04u %016" PRIx64 " %016" PRIx64 " ",
340				i, ntohll(p[0]), ntohll(p[1]));
341			if (j == 0 && i == qhp->wq.rq.pidx)
342				fprintf(stderr, " <-- pidx");
343			if (j == 0 && i == qhp->wq.rq.cidx)
344				fprintf(stderr, " <-- cidx");
345			fprintf(stderr, "\n");
346			p+=2;
347		}
348	}
349}
350
351void dump_state()
352{
353	struct c4iw_dev *dev;
354	int i;
355
356	fprintf(stderr, "STALL DETECTED:\n");
357	SLIST_FOREACH(dev, &devices, list) {
358		//pthread_spin_lock(&dev->lock);
359		fprintf(stderr, "Device %s\n", dev->ibv_dev.name);
360		for (i=0; i < dev->max_cq; i++) {
361			if (dev->cqid2ptr[i]) {
362				struct c4iw_cq *chp = dev->cqid2ptr[i];
363				//pthread_spin_lock(&chp->lock);
364				dump_cq(chp);
365				//pthread_spin_unlock(&chp->lock);
366			}
367		}
368		for (i=0; i < dev->max_qp; i++) {
369			if (dev->qpid2ptr[i]) {
370				struct c4iw_qp *qhp = dev->qpid2ptr[i];
371				//pthread_spin_lock(&qhp->lock);
372				dump_qp(qhp);
373				//pthread_spin_unlock(&qhp->lock);
374			}
375		}
376		//pthread_spin_unlock(&dev->lock);
377	}
378	fprintf(stderr, "DUMP COMPLETE:\n");
379	fflush(stderr);
380}
381#endif /* end of STALL_DETECTION */
382
/*
 * ABI version of the iw_cxgb4 kernel driver.  cxgb4_driver_init()
 * overwrites it with the version handed to it, which lets the user-mode
 * library tell whether the driver supports kernel-mode doorbell ringing.
 */
int c4iw_abi_version = 1;
388
389static struct ibv_device *cxgb4_driver_init(const char *uverbs_sys_path,
390					    int abi_version)
391{
392	char devstr[IBV_SYSFS_PATH_MAX], ibdev[16], value[128], *cp;
393	char t5nexstr[IBV_SYSFS_PATH_MAX];
394	struct c4iw_dev *dev;
395	unsigned vendor, device, fw_maj, fw_min;
396	int i;
397	char devnum=0;
398        char ib_param[16];
399
400#ifndef __linux__
401	if (ibv_read_sysfs_file(uverbs_sys_path, "ibdev",
402				ibdev, sizeof ibdev) < 0)
403		return NULL;
404	/*
405	 * Extract the non-numeric part of ibdev
406	 * say "t5nex0" -> devname=="t5nex", devnum=0
407	 */
408	if (strstr(ibdev,"t5nex")) {
409		devnum = atoi(ibdev+strlen("t5nex"));
410		sprintf(t5nexstr, "/dev/t5nex/%d", devnum);
411	} else
412		return NULL;
413
414	if (ibv_read_sysfs_file(t5nexstr, "\%pnpinfo",
415				value, sizeof value) < 0)
416		return NULL;
417	else {
418		if (strstr(value,"vendor=")) {
419			strncpy(ib_param, strstr(value,"vendor=")+strlen("vendor="),6);
420			sscanf(ib_param,"%i",&vendor);
421		}
422
423		if (strstr(value,"device=")) {
424			strncpy(ib_param, strstr(value,"device=")+strlen("device="),6);
425			sscanf(ib_param,"%i",&device);
426		}
427	}
428#else
429	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
430				value, sizeof value) < 0)
431		return NULL;
432	sscanf(value, "%i", &vendor);
433
434	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
435				value, sizeof value) < 0)
436		return NULL;
437	sscanf(value, "%i", &device);
438#endif
439
440	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
441		if (vendor == hca_table[i].vendor &&
442		    device == hca_table[i].device)
443			goto found;
444
445	return NULL;
446
447found:
448	c4iw_abi_version = abi_version;
449
450
451#ifndef __linux__
452	if (ibv_read_sysfs_file(t5nexstr, "firmware_version",
453				value, sizeof value) < 0)
454		return NULL;
455#else
456	/*
457	 * Verify that the firmware major number matches.  Major number
458	 * mismatches are fatal.  Minor number mismatches are tolerated.
459	 */
460	if (ibv_read_sysfs_file(uverbs_sys_path, "ibdev",
461				ibdev, sizeof ibdev) < 0)
462		return NULL;
463
464	memset(devstr, 0, sizeof devstr);
465	snprintf(devstr, sizeof devstr, "%s/class/infiniband/%s",
466		 ibv_get_sysfs_path(), ibdev);
467	if (ibv_read_sysfs_file(devstr, "fw_ver", value, sizeof value) < 0)
468		return NULL;
469#endif
470
471	cp = strtok(value+1, ".");
472	sscanf(cp, "%i", &fw_maj);
473	cp = strtok(NULL, ".");
474	sscanf(cp, "%i", &fw_min);
475
476	if (fw_maj < FW_MAJ) {
477		fprintf(stderr, "libcxgb4: Fatal firmware version mismatch.  "
478			"Firmware major number is %u and libcxgb4 needs %u.\n",
479			fw_maj, FW_MAJ);
480		fflush(stderr);
481		return NULL;
482	}
483
484	DBGLOG("libcxgb4");
485
486	if (fw_min < FW_MIN) {
487		PDBG("libcxgb4: non-fatal firmware version mismatch.  "
488			"Firmware minor number is %u and libcxgb4 needs %u.\n",
489			fw_maj, FW_MAJ);
490		fflush(stderr);
491	}
492
493	PDBG("%s found vendor %d device %d type %d\n",
494		__FUNCTION__, vendor, device,
495		CHELSIO_PCI_ID_CHIP_VERSION(hca_table[i].device));
496
497	dev = calloc(1, sizeof *dev);
498	if (!dev) {
499		return NULL;
500	}
501
502	pthread_spin_init(&dev->lock, PTHREAD_PROCESS_PRIVATE);
503	dev->ibv_dev.ops = c4iw_dev_ops;
504	dev->chip_version = CHELSIO_PCI_ID_CHIP_VERSION(hca_table[i].device);
505	dev->abi_version = abi_version;
506
507	PDBG("%s device claimed\n", __FUNCTION__);
508	SLIST_INSERT_HEAD(&devices, dev, list);
509#ifdef STALL_DETECTION
510{
511	char *c = getenv("CXGB4_STALL_TIMEOUT");
512	if (c) {
513		stall_to = strtol(c, NULL, 0);
514		if (errno || stall_to < 0)
515			stall_to = 0;
516	}
517}
518#endif
519{
520	char *c = getenv("CXGB4_MA_WR");
521	if (c) {
522		ma_wr = strtol(c, NULL, 0);
523		if (ma_wr != 1)
524			ma_wr = 0;
525	}
526}
527{
528	char *c = getenv("T5_ENABLE_WC");
529	if (c) {
530		t5_en_wc = strtol(c, NULL, 0);
531		if (t5_en_wc != 1)
532			t5_en_wc = 0;
533	}
534}
535
536	return &dev->ibv_dev;
537}
538
539static __attribute__((constructor)) void cxgb4_register_driver(void)
540{
541	c4iw_page_size = sysconf(_SC_PAGESIZE);
542	c4iw_page_shift = long_log2(c4iw_page_size);
543	c4iw_page_mask = ~(c4iw_page_size - 1);
544	ibv_register_driver("cxgb4", cxgb4_driver_init);
545}
546
547#ifdef STATS
548void __attribute__ ((destructor)) cs_fini(void);
549void  __attribute__ ((destructor)) cs_fini(void)
550{
551	syslog(LOG_NOTICE, "cxgb4 stats - sends %lu recv %lu read %lu "
552	       "write %lu arm %lu cqe %lu mr %lu qp %lu cq %lu\n",
553	       c4iw_stats.send, c4iw_stats.recv, c4iw_stats.read,
554	       c4iw_stats.write, c4iw_stats.arm, c4iw_stats.cqe,
555	       c4iw_stats.mr, c4iw_stats.qp, c4iw_stats.cq);
556}
557#endif
558