/*
 * Copyright (c) 2006 Intel Corporation.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/interrupt.h>
#include <linux/rbtree.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/miscdevice.h>
#include <linux/random.h>

#include <rdma/ib_cache.h>
#include <rdma/ib_sa.h>
#include "sa.h"

MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("InfiniBand subnet administration caching");
MODULE_LICENSE("Dual BSD/GPL");

enum {
	SA_DB_MAX_PATHS_PER_DEST = 0x7F,
	SA_DB_MIN_RETRY_TIMER	 = 4000,  /*   4 sec */
	SA_DB_MAX_RETRY_TIMER	 = 256000 /* 256 sec */
};

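/*
 * Tunables: paths_per_dest of 0 disables caching entirely (all lookups
 * fall through to the SA); subscribe_inform_info controls whether we
 * register for GID in/out-of-service traps; retry_timer is the initial
 * MAD response timeout, doubled on retry up to SA_DB_MAX_RETRY_TIMER.
 */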
static unsigned long paths_per_dest;
static char subscribe_inform_info = 1;
static unsigned long retry_timer = SA_DB_MIN_RETRY_TIMER;

enum sa_db_lookup_method {
	SA_DB_LOOKUP_LEAST_USED,
	SA_DB_LOOKUP_RANDOM
};

static unsigned long lookup_method;

static void sa_db_add_dev(struct ib_device *device);
static void sa_db_remove_dev(struct ib_device *device);

static struct ib_client sa_db_client = {
	.name   = "local_sa",
	.add    = sa_db_add_dev,
	.remove = sa_db_remove_dev
};

static LIST_HEAD(dev_list);
static DEFINE_MUTEX(lock);
static rwlock_t rwlock;
static struct workqueue_struct *sa_wq;
static struct ib_sa_client sa_client;

enum sa_db_state {
	SA_DB_IDLE,
	SA_DB_REFRESH,
	SA_DB_DESTROY
};

struct sa_db_port {
	struct sa_db_device	*dev;
	struct ib_mad_agent	*agent;
	/* Limit number of outstanding MADs to SA to reduce SA flooding */
	struct ib_mad_send_buf	*msg;
	u16			sm_lid;
	u8			sm_sl;
	struct ib_inform_info	*in_info;
	struct ib_inform_info	*out_info;
	struct rb_root		paths;
	struct list_head	update_list;
	unsigned long		update_id;
	enum sa_db_state	state;
	struct work_struct	work;
	union ib_gid		gid;
	int			port_num;
};

struct sa_db_device {
	struct list_head	list;
	struct ib_device	*device;
	struct ib_event_handler event_handler;
	int			start_port;
	int			port_count;
	struct sa_db_port	port[0];
};

struct ib_sa_iterator {
	struct ib_sa_iterator	*next;
};

struct ib_sa_attr_iter {
	struct ib_sa_iterator	*iter;
	unsigned long		flags;
};

struct ib_sa_attr_list {
	struct ib_sa_iterator	iter;
	struct ib_sa_iterator	*tail;
	int			update_id;
	union ib_gid		gid;
	struct rb_node		node;
};

struct ib_path_rec_info {
	struct ib_sa_iterator	iter; /* keep first */
	struct ib_sa_path_rec	rec;
	unsigned long		lookups;
};

struct ib_sa_mad_iter {
	struct ib_mad_recv_wc	*recv_wc;
	struct ib_mad_recv_buf	*recv_buf;
	int			attr_size;
	int			attr_offset;
	int			data_offset;
	int			data_left;
	void			*attr;
	u8			attr_data[0];
};

enum sa_update_type {
	SA_UPDATE_FULL,
	SA_UPDATE_ADD,
	SA_UPDATE_REMOVE
};

struct update_info {
	struct list_head	list;
	union ib_gid		gid;
	enum sa_update_type	type;
};

struct sa_path_request {
	struct work_struct	work;
	struct ib_sa_client	*client;
	void			(*callback)(int, struct ib_sa_path_rec *, void *);
	void			*context;
	struct ib_sa_path_rec	path_rec;
};

static void process_updates(struct sa_db_port *port);

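/* Free every path record chained off an attribute list, leaving it empty. */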
static void free_attr_list(struct ib_sa_attr_list *attr_list)
{
	struct ib_sa_iterator *cur;

	for (cur = attr_list->iter.next; cur; cur = attr_list->iter.next) {
		attr_list->iter.next = cur->next;
		kfree(cur);
	}
	attr_list->tail = &attr_list->iter;
}

static void remove_attr(struct rb_root *root, struct ib_sa_attr_list *attr_list)
{
	rb_erase(&attr_list->node, root);
	free_attr_list(attr_list);
	kfree(attr_list);
}

static void remove_all_attrs(struct rb_root *root)
{
	struct rb_node *node, *next_node;
	struct ib_sa_attr_list *attr_list;

	write_lock_irq(&rwlock);
	for (node = rb_first(root); node; node = next_node) {
		next_node = rb_next(node);
		attr_list = rb_entry(node, struct ib_sa_attr_list, node);
		remove_attr(root, attr_list);
	}
	write_unlock_irq(&rwlock);
}

static void remove_old_attrs(struct rb_root *root, unsigned long update_id)
{
	struct rb_node *node, *next_node;
	struct ib_sa_attr_list *attr_list;

	write_lock_irq(&rwlock);
	for (node = rb_first(root); node; node = next_node) {
		next_node = rb_next(node);
		attr_list = rb_entry(node, struct ib_sa_attr_list, node);
		if (attr_list->update_id != update_id)
			remove_attr(root, attr_list);
	}
	write_unlock_irq(&rwlock);
}

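/*
 * Insert an attribute list into the per-port rb-tree, keyed by GID.
 * Returns NULL on success, or the existing entry if the GID is already
 * present.  Caller must hold rwlock for writing.
 */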
static struct ib_sa_attr_list *insert_attr_list(struct rb_root *root,
						struct ib_sa_attr_list *attr_list)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;
	struct ib_sa_attr_list *cur_attr_list;
	int cmp;

	while (*link) {
		parent = *link;
		cur_attr_list = rb_entry(parent, struct ib_sa_attr_list, node);
		cmp = memcmp(&cur_attr_list->gid, &attr_list->gid,
			     sizeof attr_list->gid);
		if (cmp < 0)
			link = &(*link)->rb_left;
		else if (cmp > 0)
			link = &(*link)->rb_right;
		else
			return cur_attr_list;
	}
	rb_link_node(&attr_list->node, parent, link);
	rb_insert_color(&attr_list->node, root);
	return NULL;
}

static struct ib_sa_attr_list *find_attr_list(struct rb_root *root, u8 *gid)
{
	struct rb_node *node = root->rb_node;
	struct ib_sa_attr_list *attr_list;
	int cmp;

	while (node) {
		attr_list = rb_entry(node, struct ib_sa_attr_list, node);
		cmp = memcmp(&attr_list->gid, gid, sizeof attr_list->gid);
		if (cmp < 0)
			node = node->rb_left;
		else if (cmp > 0)
			node = node->rb_right;
		else
			return attr_list;
	}
	return NULL;
}

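/*
 * Append a single attribute to the list for 'key', creating the list if
 * needed.  A list left over from a previous update (stale update_id) is
 * emptied and reused rather than reallocated.
 */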
static int insert_attr(struct rb_root *root, unsigned long update_id, void *key,
		       struct ib_sa_iterator *iter)
{
	struct ib_sa_attr_list *attr_list;
	void *err;

	write_lock_irq(&rwlock);
	attr_list = find_attr_list(root, key);
	if (!attr_list) {
		write_unlock_irq(&rwlock);
		attr_list = kmalloc(sizeof *attr_list, GFP_KERNEL);
		if (!attr_list)
			return -ENOMEM;

		attr_list->iter.next = NULL;
		attr_list->tail = &attr_list->iter;
		attr_list->update_id = update_id;
		memcpy(attr_list->gid.raw, key, sizeof attr_list->gid);

		write_lock_irq(&rwlock);
		err = insert_attr_list(root, attr_list);
		if (err) {
			/*
			 * Raced with another insertion of the same GID.
			 * insert_attr_list() returns the existing (valid)
			 * entry here, not an ERR_PTR, so report -EEXIST
			 * rather than PTR_ERR(err).
			 */
			write_unlock_irq(&rwlock);
			kfree(attr_list);
			return -EEXIST;
		}
	} else if (attr_list->update_id != update_id) {
		free_attr_list(attr_list);
		attr_list->update_id = update_id;
	}

	attr_list->tail->next = iter;
	iter->next = NULL;
	attr_list->tail = iter;
	write_unlock_irq(&rwlock);
	return 0;
}

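/*
 * Build an iterator over the path record attributes in a (possibly
 * RMPP-segmented) SA response.  attr_offset is carried in the MAD
 * header in units of 8 bytes.
 */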
static struct ib_sa_mad_iter *ib_sa_iter_create(struct ib_mad_recv_wc *mad_recv_wc)
{
	struct ib_sa_mad_iter *iter;
	struct ib_sa_mad *mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
	int attr_size, attr_offset;

	attr_offset = be16_to_cpu(mad->sa_hdr.attr_offset) * 8;
	attr_size = 64;		/* path record length */
	if (attr_offset < attr_size)
		return ERR_PTR(-EINVAL);

	iter = kzalloc(sizeof *iter + attr_size, GFP_KERNEL);
	if (!iter)
		return ERR_PTR(-ENOMEM);

	iter->data_left = mad_recv_wc->mad_len - IB_MGMT_SA_HDR;
	iter->recv_wc = mad_recv_wc;
	iter->recv_buf = &mad_recv_wc->recv_buf;
	iter->attr_offset = attr_offset;
	iter->attr_size = attr_size;
	return iter;
}

static void ib_sa_iter_free(struct ib_sa_mad_iter *iter)
{
	kfree(iter);
}

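/*
 * Return the next attribute in the response, or NULL when exhausted.
 * An attribute that straddles two MAD buffers is reassembled piecewise
 * into iter->attr_data.
 */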
static void *ib_sa_iter_next(struct ib_sa_mad_iter *iter)
{
	struct ib_sa_mad *mad;
	int left, offset = 0;

	while (iter->data_left >= iter->attr_offset) {
		while (iter->data_offset < IB_MGMT_SA_DATA) {
			mad = (struct ib_sa_mad *) iter->recv_buf->mad;

			left = IB_MGMT_SA_DATA - iter->data_offset;
			if (left < iter->attr_size) {
				/* copy first piece of the attribute */
				iter->attr = &iter->attr_data;
				memcpy(iter->attr,
				       &mad->data[iter->data_offset], left);
				offset = left;
				break;
			} else if (offset) {
				/* copy the second piece of the attribute */
				memcpy(iter->attr + offset, &mad->data[0],
				       iter->attr_size - offset);
				iter->data_offset = iter->attr_size - offset;
				offset = 0;
			} else {
				iter->attr = &mad->data[iter->data_offset];
				iter->data_offset += iter->attr_size;
			}

			iter->data_left -= iter->attr_offset;
			goto out;
		}
		iter->data_offset = 0;
		iter->recv_buf = list_entry(iter->recv_buf->list.next,
					    struct ib_mad_recv_buf, list);
	}
	iter->attr = NULL;
out:
	return iter->attr;
}

/*
 * Copy path records from a received response and insert them into our cache.
 * Path records in a MAD are in network order, packed, and may span multiple
 * MAD buffers, just to make our life hard.
 */
static void update_path_db(struct sa_db_port *port,
			   struct ib_mad_recv_wc *mad_recv_wc,
			   enum sa_update_type type)
{
	struct ib_sa_mad_iter *iter;
	struct ib_path_rec_info *path_info;
	void *attr;
	int ret;

	iter = ib_sa_iter_create(mad_recv_wc);
	if (IS_ERR(iter))
		return;

	port->update_id += (type == SA_UPDATE_FULL);

	while ((attr = ib_sa_iter_next(iter)) &&
	       (path_info = kmalloc(sizeof *path_info, GFP_KERNEL))) {
		ib_sa_unpack_attr(&path_info->rec, attr, IB_SA_ATTR_PATH_REC);

		ret = insert_attr(&port->paths, port->update_id,
				  path_info->rec.dgid.raw, &path_info->iter);
		if (ret) {
			kfree(path_info);
			break;
		}
	}
	ib_sa_iter_free(iter);

	if (type == SA_UPDATE_FULL)
		remove_old_attrs(&port->paths, port->update_id);
}

static struct ib_mad_send_buf *get_sa_msg(struct sa_db_port *port,
					  struct update_info *update)
{
	struct ib_ah_attr ah_attr;
	struct ib_mad_send_buf *msg;

	msg = ib_create_send_mad(port->agent, 1, 0, 0, IB_MGMT_SA_HDR,
				 IB_MGMT_SA_DATA, GFP_KERNEL);
	if (IS_ERR(msg))
		return NULL;

	memset(&ah_attr, 0, sizeof ah_attr);
	ah_attr.dlid = port->sm_lid;
	ah_attr.sl = port->sm_sl;
	ah_attr.port_num = port->port_num;

	msg->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
	if (IS_ERR(msg->ah)) {
		ib_free_send_mad(msg);
		return NULL;
	}

	msg->timeout_ms = retry_timer;
	msg->retries = 0;
	msg->context[0] = port;
	msg->context[1] = update;
	return msg;
}

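/*
 * Generate a 64-bit transaction ID: the agent's hi_tid in the upper 32
 * bits, a monotonically increasing counter in the lower 32.
 */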
static __be64 form_tid(u32 hi_tid)
{
	static atomic_t tid;
	return cpu_to_be64((((u64) hi_tid) << 32) |
			   ((u32) atomic_inc_return(&tid)));
}

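/*
 * Format a GetTable path record request for all paths from this port's
 * SGID; an ADD update narrows the query to a single DGID.
 */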
static void format_path_req(struct sa_db_port *port,
			    struct update_info *update,
			    struct ib_mad_send_buf *msg)
{
	struct ib_sa_mad *mad = msg->mad;
	struct ib_sa_path_rec path_rec;

	mad->mad_hdr.base_version  = IB_MGMT_BASE_VERSION;
	mad->mad_hdr.mgmt_class	   = IB_MGMT_CLASS_SUBN_ADM;
	mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
	mad->mad_hdr.method	   = IB_SA_METHOD_GET_TABLE;
	mad->mad_hdr.attr_id	   = cpu_to_be16(IB_SA_ATTR_PATH_REC);
	mad->mad_hdr.tid	   = form_tid(msg->mad_agent->hi_tid);

	mad->sa_hdr.comp_mask = IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH;

	/* Zero the record so unmasked fields don't carry stack garbage. */
	memset(&path_rec, 0, sizeof path_rec);
	path_rec.sgid = port->gid;
	path_rec.numb_path = (u8) paths_per_dest;

	if (update->type == SA_UPDATE_ADD) {
		mad->sa_hdr.comp_mask |= IB_SA_PATH_REC_DGID;
		memcpy(&path_rec.dgid, &update->gid, sizeof path_rec.dgid);
	}

	ib_sa_pack_attr(mad->data, &path_rec, IB_SA_ATTR_PATH_REC);
}
469
470static int send_query(struct sa_db_port *port,
471		      struct update_info *update)
472{
473	int ret;
474
475	port->msg = get_sa_msg(port, update);
476	if (!port->msg)
477		return -ENOMEM;
478
479	format_path_req(port, update, port->msg);
480
481	ret = ib_post_send_mad(port->msg, NULL);
482	if (ret)
483		goto err;
484
485	return 0;
486
487err:
488	ib_destroy_ah(port->msg->ah);
489	ib_free_send_mad(port->msg);
490	return ret;
491}
492
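/*
 * Queue an update and, if the port is idle, start processing it.
 * Caller must hold 'lock'.  Allocation failure is tolerated (the
 * update is simply dropped).
 */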
static void add_update(struct sa_db_port *port, u8 *gid,
		       enum sa_update_type type)
{
	struct update_info *update;

	update = kmalloc(sizeof *update, GFP_KERNEL);
	if (update) {
		if (gid)
			memcpy(&update->gid, gid, sizeof update->gid);
		update->type = type;
		list_add(&update->list, &port->update_list);
	}

	if (port->state == SA_DB_IDLE) {
		port->state = SA_DB_REFRESH;
		process_updates(port);
	}
}

static void clean_update_list(struct sa_db_port *port)
{
	struct update_info *update;

	while (!list_empty(&port->update_list)) {
		update = list_entry(port->update_list.next,
				    struct update_info, list);
		list_del(&update->list);
		kfree(update);
	}
}

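/*
 * Inform-info callback: a GID-in-service trap schedules an ADD update,
 * a GID-out-of-service trap a REMOVE.  A NULL notice reports the status
 * of the registration itself.
 */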
static int notice_handler(int status, struct ib_inform_info *info,
			  struct ib_sa_notice *notice)
{
	struct sa_db_port *port = info->context;
	struct ib_sa_notice_data_gid *gid_data;
	struct ib_inform_info **pinfo;
	enum sa_update_type type;

	if (info->trap_number == IB_SA_SM_TRAP_GID_IN_SERVICE) {
		pinfo = &port->in_info;
		type = SA_UPDATE_ADD;
	} else {
		pinfo = &port->out_info;
		type = SA_UPDATE_REMOVE;
	}

	mutex_lock(&lock);
	if (port->state == SA_DB_DESTROY || !*pinfo) {
		mutex_unlock(&lock);
		return 0;
	}

	if (notice) {
		gid_data = (struct ib_sa_notice_data_gid *)
			   &notice->data_details;
		add_update(port, gid_data->gid, type);
		mutex_unlock(&lock);
	} else if (status == -ENETRESET) {
		*pinfo = NULL;
		mutex_unlock(&lock);
	} else {
		if (status)
			*pinfo = ERR_PTR(-EINVAL);
		port->state = SA_DB_IDLE;
		clean_update_list(port);
		mutex_unlock(&lock);
		queue_work(sa_wq, &port->work);
	}

	return status;
}

static int reg_in_info(struct sa_db_port *port)
{
	int ret = 0;

	port->in_info = ib_sa_register_inform_info(&sa_client,
						   port->dev->device,
						   port->port_num,
						   IB_SA_SM_TRAP_GID_IN_SERVICE,
						   GFP_KERNEL, notice_handler,
						   port);
	if (IS_ERR(port->in_info))
		ret = PTR_ERR(port->in_info);

	return ret;
}

static int reg_out_info(struct sa_db_port *port)
{
	int ret = 0;

	port->out_info = ib_sa_register_inform_info(&sa_client,
						    port->dev->device,
						    port->port_num,
						    IB_SA_SM_TRAP_GID_OUT_OF_SERVICE,
						    GFP_KERNEL, notice_handler,
						    port);
	if (IS_ERR(port->out_info))
		ret = PTR_ERR(port->out_info);

	return ret;
}

static void unsubscribe_port(struct sa_db_port *port)
{
	if (port->in_info && !IS_ERR(port->in_info))
		ib_sa_unregister_inform_info(port->in_info);

	if (port->out_info && !IS_ERR(port->out_info))
		ib_sa_unregister_inform_info(port->out_info);

	port->out_info = NULL;
	port->in_info = NULL;
}

static void cleanup_port(struct sa_db_port *port)
{
	unsubscribe_port(port);

	clean_update_list(port);
	remove_all_attrs(&port->paths);
}

static int update_port_info(struct sa_db_port *port)
{
	struct ib_port_attr port_attr;
	int ret;

	ret = ib_query_port(port->dev->device, port->port_num, &port_attr);
	if (ret)
		return ret;

	if (port_attr.state != IB_PORT_ACTIVE)
		return -ENODATA;

	port->sm_lid = port_attr.sm_lid;
	port->sm_sl = port_attr.sm_sl;
	return 0;
}

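/*
 * Drive the update state machine: (re)subscribe for traps if needed,
 * apply queued REMOVEs directly, and issue at most one SA query at a
 * time for the remaining updates.  Caller must hold 'lock'.
 */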
static void process_updates(struct sa_db_port *port)
{
	struct update_info *update;
	struct ib_sa_attr_list *attr_list;
	int ret;

	if (!paths_per_dest || update_port_info(port)) {
		cleanup_port(port);
		goto out;
	}

	/* Event registration is an optimization, so ignore failures. */
	if (subscribe_inform_info) {
		if (!port->out_info) {
			ret = reg_out_info(port);
			if (!ret)
				return;
		}

		if (!port->in_info) {
			ret = reg_in_info(port);
			if (!ret)
				return;
		}
	} else
		unsubscribe_port(port);

	while (!list_empty(&port->update_list)) {
		update = list_entry(port->update_list.next,
				    struct update_info, list);

		if (update->type == SA_UPDATE_REMOVE) {
			write_lock_irq(&rwlock);
			attr_list = find_attr_list(&port->paths,
						   update->gid.raw);
			if (attr_list)
				remove_attr(&port->paths, attr_list);
			write_unlock_irq(&rwlock);
		} else {
			ret = send_query(port, update);
			if (!ret)
				return;
		}
		list_del(&update->list);
		kfree(update);
	}
out:
	port->state = SA_DB_IDLE;
}

static void refresh_port_db(struct sa_db_port *port)
{
	if (port->state == SA_DB_DESTROY)
		return;

	if (port->state == SA_DB_REFRESH) {
		clean_update_list(port);
		ib_cancel_mad(port->agent, port->msg);
	}

	add_update(port, NULL, SA_UPDATE_FULL);
}

static void refresh_dev_db(struct sa_db_device *dev)
{
	int i;

	for (i = 0; i < dev->port_count; i++)
		refresh_port_db(&dev->port[i]);
}

static void refresh_db(void)
{
	struct sa_db_device *dev;

	list_for_each_entry(dev, &dev_list, list)
		refresh_dev_db(dev);
}

static void port_work_handler(struct work_struct *work)
{
	struct sa_db_port *port;

	port = container_of(work, typeof(*port), work);
	mutex_lock(&lock);
	refresh_port_db(port);
	mutex_unlock(&lock);
}

static void handle_event(struct ib_event_handler *event_handler,
			 struct ib_event *event)
{
	struct sa_db_device *dev;
	struct sa_db_port *port;

	dev = container_of(event_handler, typeof(*dev), event_handler);
	port = &dev->port[event->element.port_num - dev->start_port];

	switch (event->event) {
	case IB_EVENT_PORT_ERR:
	case IB_EVENT_LID_CHANGE:
	case IB_EVENT_SM_CHANGE:
	case IB_EVENT_CLIENT_REREGISTER:
	case IB_EVENT_PKEY_CHANGE:
	case IB_EVENT_PORT_ACTIVE:
		queue_work(sa_wq, &port->work);
		break;
	default:
		break;
	}
}

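/*
 * Path iteration takes rwlock for reading in ib_create_path_iter() and
 * holds it until ib_free_path_iter(); callers must not sleep in between.
 */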
static void ib_free_path_iter(struct ib_sa_attr_iter *iter)
{
	read_unlock_irqrestore(&rwlock, iter->flags);
}

static int ib_create_path_iter(struct ib_device *device, u8 port_num,
			       union ib_gid *dgid, struct ib_sa_attr_iter *iter)
{
	struct sa_db_device *dev;
	struct sa_db_port *port;
	struct ib_sa_attr_list *list;

	dev = ib_get_client_data(device, &sa_db_client);
	if (!dev)
		return -ENODEV;

	port = &dev->port[port_num - dev->start_port];

	read_lock_irqsave(&rwlock, iter->flags);
	list = find_attr_list(&port->paths, dgid->raw);
	if (!list) {
		ib_free_path_iter(iter);
		return -ENODATA;
	}

	iter->iter = &list->iter;
	return 0;
}

static struct ib_sa_path_rec *ib_get_next_path(struct ib_sa_attr_iter *iter)
{
	struct ib_path_rec_info *next_path;

	iter->iter = iter->iter->next;
	if (iter->iter) {
		next_path = container_of(iter->iter, struct ib_path_rec_info, iter);
		return &next_path->rec;
	} else
		return NULL;
}

static int cmp_rec(struct ib_sa_path_rec *src,
		   struct ib_sa_path_rec *dst, ib_sa_comp_mask comp_mask)
{
	/* DGID check already done */
	if (comp_mask & IB_SA_PATH_REC_SGID &&
	    memcmp(&src->sgid, &dst->sgid, sizeof src->sgid))
		return -EINVAL;
	if (comp_mask & IB_SA_PATH_REC_DLID && src->dlid != dst->dlid)
		return -EINVAL;
	if (comp_mask & IB_SA_PATH_REC_SLID && src->slid != dst->slid)
		return -EINVAL;
	if (comp_mask & IB_SA_PATH_REC_RAW_TRAFFIC &&
	    src->raw_traffic != dst->raw_traffic)
		return -EINVAL;

	if (comp_mask & IB_SA_PATH_REC_FLOW_LABEL &&
	    src->flow_label != dst->flow_label)
		return -EINVAL;
	if (comp_mask & IB_SA_PATH_REC_HOP_LIMIT &&
	    src->hop_limit != dst->hop_limit)
		return -EINVAL;
	if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS &&
	    src->traffic_class != dst->traffic_class)
		return -EINVAL;
	if (comp_mask & IB_SA_PATH_REC_REVERSIBLE &&
	    dst->reversible && !src->reversible)
		return -EINVAL;
	/* Numb path check already done */
	if (comp_mask & IB_SA_PATH_REC_PKEY && src->pkey != dst->pkey)
		return -EINVAL;

	if (comp_mask & IB_SA_PATH_REC_SL && src->sl != dst->sl)
		return -EINVAL;

	if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_MTU_SELECTOR,
				 IB_SA_PATH_REC_MTU, dst->mtu_selector,
				 src->mtu, dst->mtu))
		return -EINVAL;
	if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_RATE_SELECTOR,
				 IB_SA_PATH_REC_RATE, dst->rate_selector,
				 src->rate, dst->rate))
		return -EINVAL;
	if (ib_sa_check_selector(comp_mask,
				 IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR,
				 IB_SA_PATH_REC_PACKET_LIFE_TIME,
				 dst->packet_life_time_selector,
				 src->packet_life_time, dst->packet_life_time))
		return -EINVAL;

	return 0;
}

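/*
 * Pick uniformly at random among the matching paths using reservoir
 * sampling: the i-th match replaces the current choice with
 * probability 1/i.
 */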
static struct ib_sa_path_rec *get_random_path(struct ib_sa_attr_iter *iter,
					      struct ib_sa_path_rec *req_path,
					      ib_sa_comp_mask comp_mask)
{
	struct ib_sa_path_rec *path, *rand_path = NULL;
	int num, count = 0;

	for (path = ib_get_next_path(iter); path;
	     path = ib_get_next_path(iter)) {
		if (!cmp_rec(path, req_path, comp_mask)) {
			get_random_bytes(&num, sizeof num);
			if ((num % ++count) == 0)
				rand_path = path;
		}
	}

	return rand_path;
}

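/*
 * Least-used selection: return the matching path with the fewest prior
 * lookups and bump its counter.
 */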
static struct ib_sa_path_rec *get_next_path(struct ib_sa_attr_iter *iter,
					    struct ib_sa_path_rec *req_path,
					    ib_sa_comp_mask comp_mask)
{
	struct ib_path_rec_info *cur_path, *next_path = NULL;
	struct ib_sa_path_rec *path;
	unsigned long lookups = ~0;

	for (path = ib_get_next_path(iter); path;
	     path = ib_get_next_path(iter)) {
		if (!cmp_rec(path, req_path, comp_mask)) {
			cur_path = container_of(iter->iter, struct ib_path_rec_info,
						iter);
			if (cur_path->lookups < lookups) {
				lookups = cur_path->lookups;
				next_path = cur_path;
			}
		}
	}

	if (next_path) {
		next_path->lookups++;
		return &next_path->rec;
	} else
		return NULL;
}

static void report_path(struct work_struct *work)
{
	struct sa_path_request *req;

	req = container_of(work, struct sa_path_request, work);
	req->callback(0, &req->path_rec, req->context);
	ib_sa_client_put(req->client);
	kfree(req);
}

/**
 * ib_sa_path_rec_get - Start a Path get query
 * @client:SA client
 * @device:device to send query on
 * @port_num: port number to send query on
 * @rec:Path Record to send in query
 * @comp_mask:component mask to send in query
 * @timeout_ms:time to wait for response
 * @gfp_mask:GFP mask to use for internal allocations
 * @callback:function called when query completes, times out or is
 * canceled
 * @context:opaque user context passed to callback
 * @sa_query:query context, used to cancel query
 *
 * Send a Path Record Get query to the SA to look up a path.  The
 * callback function will be called when the query completes (or
 * fails); status is 0 for a successful response, -EINTR if the query
 * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
 * occurred sending the query.  The resp parameter of the callback is
 * only valid if status is 0.
 *
 * If the return value of ib_sa_path_rec_get() is negative, it is an
 * error code.  Otherwise it is a query ID that can be used to cancel
 * the query.
 */
int ib_sa_path_rec_get(struct ib_sa_client *client,
		       struct ib_device *device, u8 port_num,
		       struct ib_sa_path_rec *rec,
		       ib_sa_comp_mask comp_mask,
		       int timeout_ms, gfp_t gfp_mask,
		       void (*callback)(int status,
					struct ib_sa_path_rec *resp,
					void *context),
		       void *context,
		       struct ib_sa_query **sa_query)
{
	struct sa_path_request *req;
	struct ib_sa_attr_iter iter;
	struct ib_sa_path_rec *path_rec;
	int ret;

	if (!paths_per_dest)
		goto query_sa;

	if (!(comp_mask & IB_SA_PATH_REC_DGID) ||
	    !(comp_mask & IB_SA_PATH_REC_NUMB_PATH) || rec->numb_path != 1)
		goto query_sa;

	req = kmalloc(sizeof *req, gfp_mask);
	if (!req)
		goto query_sa;

	ret = ib_create_path_iter(device, port_num, &rec->dgid, &iter);
	if (ret)
		goto free_req;

	if (lookup_method == SA_DB_LOOKUP_RANDOM)
		path_rec = get_random_path(&iter, rec, comp_mask);
	else
		path_rec = get_next_path(&iter, rec, comp_mask);

	if (!path_rec)
		goto free_iter;

	memcpy(&req->path_rec, path_rec, sizeof *path_rec);
	ib_free_path_iter(&iter);

	INIT_WORK(&req->work, report_path);
	req->client = client;
	req->callback = callback;
	req->context = context;

	ib_sa_client_get(client);
	queue_work(sa_wq, &req->work);
	/* Answered from the cache; there is no SA query to cancel. */
	*sa_query = ERR_PTR(-EEXIST);
	return 0;

free_iter:
	ib_free_path_iter(&iter);
free_req:
	kfree(req);
query_sa:
	return ib_sa_path_rec_query(client, device, port_num, rec, comp_mask,
				    timeout_ms, gfp_mask, callback, context,
				    sa_query);
}
EXPORT_SYMBOL(ib_sa_path_rec_get);
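
/*
 * Example (illustrative sketch, not from this file): a caller that wants
 * cache hits must set both DGID and NUMB_PATH in the component mask and
 * request exactly one path; 'my_client', 'my_callback' and 'my_context'
 * are hypothetical.
 *
 *	struct ib_sa_query *query;
 *	rec.numb_path = 1;
 *	ret = ib_sa_path_rec_get(&my_client, device, port_num, &rec,
 *				 IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
 *				 IB_SA_PATH_REC_NUMB_PATH, 1000, GFP_KERNEL,
 *				 my_callback, my_context, &query);
 */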

static void recv_handler(struct ib_mad_agent *mad_agent,
			 struct ib_mad_recv_wc *mad_recv_wc)
{
	struct sa_db_port *port;
	struct update_info *update;
	struct ib_mad_send_buf *msg;
	enum sa_update_type type;

	msg = (struct ib_mad_send_buf *) (unsigned long) mad_recv_wc->wc->wr_id;
	port = msg->context[0];
	update = msg->context[1];

	mutex_lock(&lock);
	if (port->state == SA_DB_DESTROY ||
	    update != list_entry(port->update_list.next,
				 struct update_info, list)) {
		mutex_unlock(&lock);
	} else {
		type = update->type;
		mutex_unlock(&lock);
		update_path_db(mad_agent->context, mad_recv_wc, type);
	}

	ib_free_recv_mad(mad_recv_wc);
}

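/*
 * Send completion: on a response timeout, retry with the timeout doubled
 * (up to SA_DB_MAX_RETRY_TIMER); otherwise retire the update at the head
 * of the queue and move on to the next one.
 */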
static void send_handler(struct ib_mad_agent *agent,
			 struct ib_mad_send_wc *mad_send_wc)
{
	struct ib_mad_send_buf *msg;
	struct sa_db_port *port;
	struct update_info *update;
	int ret;

	msg = mad_send_wc->send_buf;
	port = msg->context[0];
	update = msg->context[1];

	mutex_lock(&lock);
	if (port->state == SA_DB_DESTROY)
		goto unlock;

	if (update == list_entry(port->update_list.next,
				 struct update_info, list)) {
		if (mad_send_wc->status == IB_WC_RESP_TIMEOUT_ERR &&
		    msg->timeout_ms < SA_DB_MAX_RETRY_TIMER) {
			msg->timeout_ms <<= 1;
			ret = ib_post_send_mad(msg, NULL);
			if (!ret) {
				mutex_unlock(&lock);
				return;
			}
		}
		list_del(&update->list);
		kfree(update);
	}
	process_updates(port);
unlock:
	mutex_unlock(&lock);

	ib_destroy_ah(msg->ah);
	ib_free_send_mad(msg);
}

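/*
 * Per-port setup: cache the port GID and register a GSI MAD agent whose
 * send/receive handlers drive the path database.
 */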
static int init_port(struct sa_db_device *dev, int port_num)
{
	struct sa_db_port *port;
	int ret;

	port = &dev->port[port_num - dev->start_port];
	port->dev = dev;
	port->port_num = port_num;
	INIT_WORK(&port->work, port_work_handler);
	port->paths = RB_ROOT;
	INIT_LIST_HEAD(&port->update_list);

	ret = ib_get_cached_gid(dev->device, port_num, 0, &port->gid);
	if (ret)
		return ret;

	port->agent = ib_register_mad_agent(dev->device, port_num, IB_QPT_GSI,
					    NULL, IB_MGMT_RMPP_VERSION,
					    send_handler, recv_handler, port);
	if (IS_ERR(port->agent))
		ret = PTR_ERR(port->agent);

	return ret;
}

static void destroy_port(struct sa_db_port *port)
{
	mutex_lock(&lock);
	port->state = SA_DB_DESTROY;
	mutex_unlock(&lock);

	ib_unregister_mad_agent(port->agent);
	cleanup_port(port);
	flush_workqueue(sa_wq);
}

static void sa_db_add_dev(struct ib_device *device)
{
	struct sa_db_device *dev;
	struct sa_db_port *port;
	int s, e, i, ret;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	if (device->node_type == RDMA_NODE_IB_SWITCH) {
		s = e = 0;
	} else {
		s = 1;
		e = device->phys_port_cnt;
	}

	dev = kzalloc(sizeof *dev + (e - s + 1) * sizeof *port, GFP_KERNEL);
	if (!dev)
		return;

	dev->start_port = s;
	dev->port_count = e - s + 1;
	dev->device = device;
	for (i = 0; i < dev->port_count; i++) {
		ret = init_port(dev, s + i);
		if (ret)
			goto err;
	}

	ib_set_client_data(device, &sa_db_client, dev);

	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event);

	mutex_lock(&lock);
	list_add_tail(&dev->list, &dev_list);
	refresh_dev_db(dev);
	mutex_unlock(&lock);

	ib_register_event_handler(&dev->event_handler);
	return;
err:
	while (i--)
		destroy_port(&dev->port[i]);
	kfree(dev);
}

static void sa_db_remove_dev(struct ib_device *device)
{
	struct sa_db_device *dev;
	int i;

	dev = ib_get_client_data(device, &sa_db_client);
	if (!dev)
		return;

	ib_unregister_event_handler(&dev->event_handler);
	flush_workqueue(sa_wq);

	for (i = 0; i < dev->port_count; i++)
		destroy_port(&dev->port[i]);

	mutex_lock(&lock);
	list_del(&dev->list);
	mutex_unlock(&lock);

	kfree(dev);
}

int sa_db_init(void)
{
	int ret;

	rwlock_init(&rwlock);
	sa_wq = create_singlethread_workqueue("local_sa");
	if (!sa_wq)
		return -ENOMEM;

	ib_sa_register_client(&sa_client);
	ret = ib_register_client(&sa_db_client);
	if (ret)
		goto err;

	return 0;

err:
	ib_sa_unregister_client(&sa_client);
	destroy_workqueue(sa_wq);
	return ret;
}

void sa_db_cleanup(void)
{
	ib_unregister_client(&sa_db_client);
	ib_sa_unregister_client(&sa_client);
	destroy_workqueue(sa_wq);
}