cxgb_l2t.c revision 183292
/**************************************************************************

Copyright (c) 2007, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_l2t.c 183292 2008-09-23 03:16:54Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#if __FreeBSD_version > 700000
#include <sys/rwlock.h>
#endif

#include <sys/socket.h>
#include <net/if.h>
#include <net/ethernet.h>
#include <net/if_vlan_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>

#include <cxgb_include.h>

#define VLAN_NONE 0xfff
#define SDL(s) ((struct sockaddr_dl *)s)
#define RT_ENADDR(sa)  ((u_char *)LLADDR(SDL((sa))))
#define rt_expire rt_rmx.rmx_expire

struct llinfo_arp {
	struct	callout la_timer;
	struct	rtentry *la_rt;
	struct	mbuf *la_hold;	/* last packet until resolved/timeout */
	u_short	la_preempt;	/* countdown for pre-expiry arps */
	u_short	la_asked;	/* # requests sent */
};
/*
 * Module locking notes:  There is a RW lock protecting the L2 table as a
 * whole plus a mutex per L2T entry (a default MTX_DEF mutex, see
 * t3_init_l2t).  Entry lookups and allocations happen under the protection
 * of the table lock; individual entry changes happen while holding that
 * entry's mutex.  The table lock nests outside the entry locks.
 * Allocations of new entries take the table lock as writers so no other
 * lookups can happen while allocating new entries.  Entry updates take the
 * table lock as readers so multiple entries can be updated in parallel.
 * Dropping an L2T entry only decrements its reference count, so it can
 * happen in parallel with entry allocation, but no entry can change state
 * or increment its ref count during allocation, as both of those
 * operations perform lookups.
 */
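/*
 * A minimal sketch of the acquisition order that follows from the notes
 * above (illustrative only, not code from the driver):
 *
 *	rw_rlock(&d->lock);		table lock first (reader, for updates)
 *	mtx_lock(&e->lock);		then the entry's mutex
 *	... change the entry ...
 *	mtx_unlock(&e->lock);
 *	rw_runlock(&d->lock);
 *
 * Allocation paths are identical except that they take d->lock as a
 * writer via rw_wlock()/rw_wunlock().
 */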

static inline unsigned int
vlan_prio(const struct l2t_entry *e)
{
	return (e->vlan >> 13);
}

static inline unsigned int
arp_hash(u32 key, int ifindex, const struct l2t_data *d)
{
	return (jhash_2words(key, ifindex, 0) & (d->nentries - 1));
}

static inline void
neigh_replace(struct l2t_entry *e, struct rtentry *rt)
{
	RT_LOCK(rt);
	RT_ADDREF(rt);
	RT_UNLOCK(rt);

	if (e->neigh)
		RTFREE(e->neigh);
	e->neigh = rt;
}

/*
 * Set up an L2T entry and send any packets waiting in the arp queue.  The
 * supplied mbuf is used for the CPL_L2T_WRITE_REQ.  Must be called with the
 * entry locked.
 */
static int
setup_l2e_send_pending(struct t3cdev *dev, struct mbuf *m,
    struct l2t_entry *e)
{
	struct cpl_l2t_write_req *req;

	if (!m) {
		if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
			return (ENOMEM);
	}
	/*
	 * XXX MH_ALIGN
	 */
	req = mtod(m, struct cpl_l2t_write_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx));
	req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) |
			    V_L2T_W_VLAN(e->vlan & EVL_VLID_MASK) |
			    V_L2T_W_PRIO(vlan_prio(e)));

	memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
	m_set_priority(m, CPL_PRIORITY_CONTROL);
	cxgb_ofld_send(dev, m);

	/* Drain the arp queue in FIFO order now that the entry is valid. */
	while (e->arpq_head) {
		m = e->arpq_head;
		e->arpq_head = m->m_next;
		m->m_next = NULL;
		cxgb_ofld_send(dev, m);
	}
	e->arpq_tail = NULL;
	e->state = L2T_STATE_VALID;

	return (0);
}

/*
 * Add a packet to an L2T entry's queue of packets awaiting resolution.
 * Must be called with the entry's lock held.
 */
static inline void
arpq_enqueue(struct l2t_entry *e, struct mbuf *m)
{
	m->m_next = NULL;
	if (e->arpq_head)
		e->arpq_tail->m_next = m;
	else
		e->arpq_head = m;
	e->arpq_tail = m;
}
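/*
 * The arpq is a singly-linked list chained through m_next; after three
 * enqueues it looks like:
 *
 *	e->arpq_head -> m1 -> m2 -> m3 <- e->arpq_tail
 *
 * setup_l2e_send_pending() above drains it in the same FIFO order.
 */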

int
t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, struct l2t_entry *e)
{
	struct rtentry *rt = e->neigh;
	struct sockaddr_in sin;

	bzero(&sin, sizeof(struct sockaddr_in));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(struct sockaddr_in);
	sin.sin_addr.s_addr = e->addr;

	CTR2(KTR_CXGB, "send slow on rt=%p eaddr=0x%08x", rt, e->addr);
again:
	switch (e->state) {
	case L2T_STATE_STALE:     /* entry is stale, kick off revalidation */
		arpresolve(rt->rt_ifp, rt, NULL,
		    (struct sockaddr *)&sin, e->dmac);
		mtx_lock(&e->lock);
		if (e->state == L2T_STATE_STALE)
			e->state = L2T_STATE_VALID;
		mtx_unlock(&e->lock);
		/* FALLTHROUGH */
	case L2T_STATE_VALID:     /* fast-path, send the packet on */
		return (cxgb_ofld_send(dev, m));
	case L2T_STATE_RESOLVING:
		mtx_lock(&e->lock);
		if (e->state != L2T_STATE_RESOLVING) {
			/* ARP already completed */
			mtx_unlock(&e->lock);
			goto again;
		}
		arpq_enqueue(e, m);
		mtx_unlock(&e->lock);
		/*
		 * Only the first packet added to the arpq should kick off
		 * resolution.  However, because the m_gethdr below can fail,
		 * we allow each packet added to the arpq to retry resolution
		 * as a way of recovering from transient memory exhaustion.
		 * A better way would be to use a work request to retry L2T
		 * entries when there's no memory.
		 */
		if (arpresolve(rt->rt_ifp, rt, NULL,
		    (struct sockaddr *)&sin, e->dmac) == 0) {
			CTR6(KTR_CXGB, "mac=%x:%x:%x:%x:%x:%x",
			    e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3],
			    e->dmac[4], e->dmac[5]);

			if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
				return (ENOMEM);

			mtx_lock(&e->lock);
			if (e->arpq_head)
				setup_l2e_send_pending(dev, m, e);
			else	/* an earlier send already emptied the arpq */
				m_freem(m);
			mtx_unlock(&e->lock);
		}
	}
	return (0);
}
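/*
 * For reference, the entry state transitions driven by this file
 * (reconstructed from the code, not an authoritative diagram):
 *
 *	UNUSED    -> RESOLVING	t3_l2t_get() allocates the entry
 *	RESOLVING -> VALID	setup_l2e_send_pending() writes the HW L2T
 *	VALID     -> STALE	reuse_entry() sees a packet still held on
 *				the neighbor's llinfo
 *	STALE     -> VALID	t3_l2t_send_slow()/t3_l2t_send_event()
 *				revalidate via arpresolve()
 */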

void
t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e)
{
	struct rtentry *rt;
	struct mbuf *m0;
	struct sockaddr_in sin;

	bzero(&sin, sizeof(struct sockaddr_in));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(struct sockaddr_in);
	sin.sin_addr.s_addr = e->addr;

	if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
		return;

	rt = e->neigh;
again:
	switch (e->state) {
	case L2T_STATE_STALE:     /* entry is stale, kick off revalidation */
		arpresolve(rt->rt_ifp, rt, NULL,
		    (struct sockaddr *)&sin, e->dmac);
		mtx_lock(&e->lock);
		if (e->state == L2T_STATE_STALE)
			e->state = L2T_STATE_VALID;
		mtx_unlock(&e->lock);
		break;
	case L2T_STATE_VALID:     /* fast-path, nothing to do */
		break;
	case L2T_STATE_RESOLVING:
		mtx_lock(&e->lock);
		if (e->state != L2T_STATE_RESOLVING) {
			/* ARP already completed */
			mtx_unlock(&e->lock);
			goto again;
		}
		mtx_unlock(&e->lock);

		/*
		 * Only the first packet added to the arpq should kick off
		 * resolution.  However, because the m_gethdr above can fail,
		 * we allow each packet added to the arpq to retry resolution
		 * as a way of recovering from transient memory exhaustion.
		 * A better way would be to use a work request to retry L2T
		 * entries when there's no memory.
		 */
		arpresolve(rt->rt_ifp, rt, NULL,
		    (struct sockaddr *)&sin, e->dmac);
		break;
	}
	/* XXX m0 is not consumed on any path yet; free it to avoid a leak. */
	m_freem(m0);
}

/*
 * Allocate a free L2T entry.  Must be called with l2t_data.lock held.
 */
static struct l2t_entry *
alloc_l2e(struct l2t_data *d)
{
	struct l2t_entry *end, *e, **p;

	if (!atomic_load_acq_int(&d->nfree))
		return (NULL);

	/* there's definitely a free entry */
	for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e)
		if (atomic_load_acq_int(&e->refcnt) == 0)
			goto found;

	for (e = &d->l2tab[1]; atomic_load_acq_int(&e->refcnt); ++e)
		continue;
found:
	d->rover = e + 1;
	atomic_add_int(&d->nfree, -1);

	/*
	 * The entry we found may be an inactive entry that is
	 * presently in the hash table.  We need to remove it.
	 */
	if (e->state != L2T_STATE_UNUSED) {
		int hash = arp_hash(e->addr, e->ifindex, d);

		for (p = &d->l2tab[hash].first; *p; p = &(*p)->next)
			if (*p == e) {
				*p = e->next;
				break;
			}
		e->state = L2T_STATE_UNUSED;
	}

	return (e);
}
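/*
 * Note on the scan above: it starts at d->rover and, if it runs off the
 * end of the table, restarts at l2tab[1].  Entry 0 is reserved and never
 * handed out (t3_init_l2t starts the rover at l2tab[1] for this reason).
 */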

/*
 * Called when an L2T entry has no more users.  The entry is left in the hash
 * table since it is likely to be reused but we also bump nfree to indicate
 * that the entry can be reallocated for a different neighbor.  We also drop
 * the existing neighbor reference in case the neighbor is going away and is
 * waiting on our reference.
 *
 * Because entries can be reallocated to other neighbors once their ref count
 * drops to 0 we need to take the entry's lock to avoid races with a new
 * incarnation.
 */
void
t3_l2e_free(struct l2t_data *d, struct l2t_entry *e)
{
	struct rtentry *rt = NULL;

	mtx_lock(&e->lock);
	if (atomic_load_acq_int(&e->refcnt) == 0) {  /* hasn't been recycled */
		rt = e->neigh;
		e->neigh = NULL;
	}
	mtx_unlock(&e->lock);

	atomic_add_int(&d->nfree, 1);
	if (rt)
		RTFREE(rt);
}
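/*
 * Reference-count pairing, as a sketch.  l2t_hold() is used by
 * t3_l2t_get() below; the matching release is assumed to live in
 * cxgb_l2t.h as an inline that calls t3_l2e_free() on the last drop
 * (the name l2t_release is an assumption, it is not defined in this
 * file):
 *
 *	e = t3_l2t_get(dev, rt, ifp, sa);	refcnt = 1
 *	l2t_hold(L2DATA(dev), e);		a second user takes a ref
 *	...
 *	l2t_release(L2DATA(dev), e);		drop one reference
 *	l2t_release(L2DATA(dev), e);		last drop ends in t3_l2e_free()
 */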

/*
 * Update an L2T entry that was previously used for the same next hop as
 * neigh.  Called from t3_l2t_get() with the table lock held as a writer.
 */
static inline void
reuse_entry(struct l2t_entry *e, struct rtentry *neigh)
{
	struct llinfo_arp *la;

	la = (struct llinfo_arp *)neigh->rt_llinfo;

	mtx_lock(&e->lock);                /* avoid race with t3_l2e_free */
	if (neigh != e->neigh)
		neigh_replace(e, neigh);

	if (memcmp(e->dmac, RT_ENADDR(neigh->rt_gateway), sizeof(e->dmac)) ||
	    (neigh->rt_expire > time_uptime))
		e->state = L2T_STATE_RESOLVING;
	else if (la->la_hold == NULL)
		e->state = L2T_STATE_VALID;
	else
		e->state = L2T_STATE_STALE;
	mtx_unlock(&e->lock);
}

struct l2t_entry *
t3_l2t_get(struct t3cdev *dev, struct rtentry *neigh, struct ifnet *ifp,
	struct sockaddr *sa)
{
	struct l2t_entry *e;
	struct l2t_data *d = L2DATA(dev);
	u32 addr = ((struct sockaddr_in *)sa)->sin_addr.s_addr;
	int ifidx = neigh->rt_ifp->if_index;
	int hash = arp_hash(addr, ifidx, d);
	unsigned int smt_idx = ((struct port_info *)ifp->if_softc)->port_id;

	rw_wlock(&d->lock);
	for (e = d->l2tab[hash].first; e; e = e->next)
		if (e->addr == addr && e->ifindex == ifidx &&
		    e->smt_idx == smt_idx) {
			l2t_hold(d, e);
			if (atomic_load_acq_int(&e->refcnt) == 1)
				reuse_entry(e, neigh);
			goto done;
		}

	/* Need to allocate a new entry */
	e = alloc_l2e(d);
	if (e) {
		mtx_lock(&e->lock);          /* avoid race with t3_l2e_free */
		e->next = d->l2tab[hash].first;
		d->l2tab[hash].first = e;
		rw_wunlock(&d->lock);

		e->state = L2T_STATE_RESOLVING;
		e->addr = addr;
		e->ifindex = ifidx;
		e->smt_idx = smt_idx;
		atomic_store_rel_int(&e->refcnt, 1);
		e->neigh = NULL;

		neigh_replace(e, neigh);
#ifdef notyet
		/*
		 * XXX need to add accessor function for vlan tag
		 */
		if (neigh->rt_ifp->if_vlantrunk)
			e->vlan = VLAN_DEV_INFO(neigh->dev)->vlan_id;
		else
#endif
			e->vlan = VLAN_NONE;
		mtx_unlock(&e->lock);

		return (e);
	}

done:
	rw_wunlock(&d->lock);
	return (e);
}
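/*
 * Illustrative caller of t3_l2t_get()/t3_l2t_send_slow().  This is a
 * sketch only: the function is hypothetical and compiled out, and a real
 * consumer (e.g. the TOE connection-setup path) carries more state and
 * error handling.
 */
#if 0
static int
example_send_via_l2t(struct t3cdev *dev, struct rtentry *rt,
    struct ifnet *ifp, struct sockaddr *sa, struct mbuf *m)
{
	struct l2t_entry *e;

	e = t3_l2t_get(dev, rt, ifp, sa);	/* takes a reference */
	if (e == NULL) {
		m_freem(m);
		return (ENOMEM);
	}
	/* Transmits immediately or queues m on the entry's arpq. */
	return (t3_l2t_send_slow(dev, m, e));
}
#endif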

/*
 * Called when address resolution fails for an L2T entry to handle packets
 * on the arpq head.  If a packet specifies a failure handler it is invoked,
 * otherwise the packet is sent to the TOE.
 *
 * XXX: maybe we should abandon the latter behavior and just require a failure
 * handler.
 */
static void
handle_failed_resolution(struct t3cdev *dev, struct mbuf *arpq)
{

	while (arpq) {
		struct mbuf *m = arpq;
#ifdef notyet
		struct l2t_mbuf_cb *cb = L2T_MBUF_CB(m);
#endif
		arpq = m->m_next;
		m->m_next = NULL;
#ifdef notyet
		if (cb->arp_failure_handler)
			cb->arp_failure_handler(dev, m);
		else
#endif
			cxgb_ofld_send(dev, m);
	}
}

void
t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh,
    uint8_t *enaddr, struct sockaddr *sa)
{
	struct l2t_entry *e;
	struct mbuf *arpq = NULL;
	struct l2t_data *d = L2DATA(dev);
	u32 addr = ((struct sockaddr_in *)sa)->sin_addr.s_addr;
	int ifidx = neigh->rt_ifp->if_index;
	int hash = arp_hash(addr, ifidx, d);
	struct llinfo_arp *la;

	rw_rlock(&d->lock);
	for (e = d->l2tab[hash].first; e; e = e->next)
		if (e->addr == addr && e->ifindex == ifidx) {
			mtx_lock(&e->lock);
			goto found;
		}
	rw_runlock(&d->lock);
	CTR1(KTR_CXGB, "t3_l2t_update: addr=0x%08x not found", addr);
	return;

found:
	CTR1(KTR_CXGB, "t3_l2t_update: addr=0x%08x found", addr);
	rw_runlock(&d->lock);
	memcpy(e->dmac, enaddr, ETHER_ADDR_LEN);
	CTR6(KTR_CXGB, "mac=%x:%x:%x:%x:%x:%x",
	    e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4],
	    e->dmac[5]);

	if (atomic_load_acq_int(&e->refcnt)) {
		if (neigh != e->neigh)
			neigh_replace(e, neigh);

		la = (struct llinfo_arp *)neigh->rt_llinfo;
		if (e->state == L2T_STATE_RESOLVING) {
			if (la->la_asked >= 5 /* arp_maxtries */) {
				arpq = e->arpq_head;
				e->arpq_head = e->arpq_tail = NULL;
			} else
				setup_l2e_send_pending(dev, NULL, e);
		} else {
			e->state = L2T_STATE_VALID;
			if (memcmp(e->dmac, RT_ENADDR(neigh->rt_gateway), 6))
				setup_l2e_send_pending(dev, NULL, e);
		}
	}
	mtx_unlock(&e->lock);

	if (arpq)
		handle_failed_resolution(dev, arpq);
}

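/*
 * Note: arp_hash() reduces the jhash with "& (nentries - 1)", so the
 * l2t_capacity passed in below must be a power of two; with any other
 * size some hash buckets could never be selected.
 */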
struct l2t_data *
t3_init_l2t(unsigned int l2t_capacity)
{
	struct l2t_data *d;
	int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry);

	d = cxgb_alloc_mem(size);
	if (!d)
		return (NULL);

	d->nentries = l2t_capacity;
	d->rover = &d->l2tab[1];	/* entry 0 is not used */
	atomic_store_rel_int(&d->nfree, l2t_capacity - 1);
	rw_init(&d->lock, "L2T");

	for (i = 0; i < l2t_capacity; ++i) {
		d->l2tab[i].idx = i;
		d->l2tab[i].state = L2T_STATE_UNUSED;
		mtx_init(&d->l2tab[i].lock, "L2TAB", NULL, MTX_DEF);
		atomic_store_rel_int(&d->l2tab[i].refcnt, 0);
	}
	return (d);
}

void
t3_free_l2t(struct l2t_data *d)
{
	int i;

	rw_destroy(&d->lock);
	for (i = 0; i < d->nentries; ++i)
		mtx_destroy(&d->l2tab[i].lock);

	cxgb_free_mem(d);
}