1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 *	@(#)if_ether.c	8.1 (Berkeley) 6/10/93
32 */
33
34/*
35 * Ethernet address resolution protocol.
36 * TODO:
37 *	add "inuse/lock" bit (or ref. count) along with valid bit
38 */
39
40#include <sys/cdefs.h>
41__FBSDID("$FreeBSD$");
42
43#include "opt_inet.h"
44
45#include <sys/param.h>
46#include <sys/eventhandler.h>
47#include <sys/kernel.h>
48#include <sys/lock.h>
49#include <sys/queue.h>
50#include <sys/sysctl.h>
51#include <sys/systm.h>
52#include <sys/mbuf.h>
53#include <sys/malloc.h>
54#include <sys/proc.h>
55#include <sys/rmlock.h>
56#include <sys/socket.h>
57#include <sys/syslog.h>
58
59#include <net/if.h>
60#include <net/if_var.h>
61#include <net/if_dl.h>
62#include <net/if_types.h>
63#include <net/netisr.h>
64#include <net/ethernet.h>
65#include <net/route.h>
66#include <net/route/nhop.h>
67#include <net/vnet.h>
68
69#include <netinet/in.h>
70#include <netinet/in_fib.h>
71#include <netinet/in_var.h>
72#include <net/if_llatbl.h>
73#include <netinet/if_ether.h>
74#ifdef INET
75#include <netinet/ip_carp.h>
76#endif
77
78#include <security/mac/mac_framework.h>
79
/* View a generic sockaddr pointer as a read-only IPv4 sockaddr_in pointer. */
#define SIN(s) ((const struct sockaddr_in *)(s))

/*
 * State for rate limiting the messages emitted through ARP_LOG(): all
 * three variables are handed to ppsratecheck() by that macro.
 */
static struct timeval arp_lastlog;
static int arp_curpps;
static int arp_maxpps = 1;	/* exported as sysctl max_log_per_second */
85
/*
 * Simple ARP state machine.
 *
 * The value is kept in the ln_state field of an llentry and selects the
 * aging action taken by arptimer().
 */
enum arp_llinfo_state {
	ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */
	ARP_LLINFO_REACHABLE,	/* LLE is valid */
	ARP_LLINFO_VERIFY,	/* LLE is valid, need refresh */
	ARP_LLINFO_DELETED,	/* LLE is deleted */
};
93
/*
 * sysctl nodes hanging off _net_link_ether: "inet" holds the tunables
 * declared below, "arp" holds the statistics and the log level.
 */
SYSCTL_DECL(_net_link_ether);
static SYSCTL_NODE(_net_link_ether, PF_INET, inet,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "");
static SYSCTL_NODE(_net_link_ether, PF_ARP, arp,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "");
101
/* timer values */
VNET_DEFINE_STATIC(int, arpt_keep) = (20*60);	/* once resolved, good for 20
						 * minutes */
/*
 * Number of requests after which arpresolve_full() stops returning
 * EWOULDBLOCK and reports EHOSTDOWN/EHOSTUNREACH instead.
 */
VNET_DEFINE_STATIC(int, arp_maxtries) = 5;
/* Non-zero enables proxy ARP for all suitable requests. */
VNET_DEFINE_STATIC(int, arp_proxyall) = 0;
VNET_DEFINE_STATIC(int, arpt_down) = 20;	/* keep incomplete entries for
						 * 20 seconds */
VNET_DEFINE_STATIC(int, arpt_rexmit) = 1;	/* retransmit arp entries, sec*/
VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat);  /* ARP statistics, see if_arp.h */
VNET_PCPUSTAT_SYSINIT(arpstat);

#ifdef VIMAGE
VNET_PCPUSTAT_SYSUNINIT(arpstat);
#endif /* VIMAGE */

/* Upper bound on the packets queued on one entry while it is unresolved. */
VNET_DEFINE_STATIC(int, arp_maxhold) = 16;

/* Accessors for the per-VNET instances of the variables above. */
#define	V_arpt_keep		VNET(arpt_keep)
#define	V_arpt_down		VNET(arpt_down)
#define	V_arpt_rexmit		VNET(arpt_rexmit)
#define	V_arp_maxtries		VNET(arp_maxtries)
#define	V_arp_proxyall		VNET(arp_proxyall)
#define	V_arp_maxhold		VNET(arp_maxhold)
125
/*
 * Read/write sysctl exports of the tunables.  Those flagged CTLFLAG_VNET
 * refer to per-VNET variables; max_log_per_second refers to the single
 * global arp_maxpps.
 */
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(arpt_keep), 0,
	"ARP entry lifetime in seconds");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(arp_maxtries), 0,
	"ARP resolution attempts before returning error");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(arp_proxyall), 0,
	"Enable proxy ARP for all suitable requests");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(arpt_down), 0,
	"Incomplete ARP entry lifetime in seconds");
SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat,
    arpstat, "ARP statistics (struct arpstat, net/if_arp.h)");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(arp_maxhold), 0,
	"Number of packets to hold per ARP entry");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second,
	CTLFLAG_RW, &arp_maxpps, 0,
	"Maximum number of remotely triggered ARP messages that can be "
	"logged per second");
147
/*
 * Due to the exponential backoff algorithm used for the interval between GARP
 * retransmissions, the maximum number of retransmissions is limited for
 * sanity. This limit corresponds to a maximum interval between retransmissions
 * of 2^16 seconds ~= 18 hours.
 *
 * Making this limit more dynamic is more complicated than worthwhile,
 * especially since sending out GARPs spaced days apart would be of little
 * use. A maximum dynamic limit would look something like:
 *
 * const int max = fls(INT_MAX / hz) - 1;
 */
#define MAX_GARP_RETRANSMITS 16
/*
 * Handler behind the garp_rexmit_count sysctl; its definition is not in
 * this part of the file (presumably it enforces MAX_GARP_RETRANSMITS --
 * confirm against the definition).
 */
static int sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS);
static int garp_rexmit_count = 0; /* GARP retransmission setting. */

SYSCTL_PROC(_net_link_ether_inet, OID_AUTO, garp_rexmit_count,
    CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
    &garp_rexmit_count, 0, sysctl_garp_rexmit, "I",
    "Number of times to retransmit GARP packets;"
    " 0 to disable, maximum of 16");
169
VNET_DEFINE_STATIC(int, arp_log_level) = LOG_INFO;	/* Min. log(9) level. */
#define	V_arp_log_level		VNET(arp_log_level)
SYSCTL_INT(_net_link_ether_arp, OID_AUTO, log_level, CTLFLAG_VNET | CTLFLAG_RW,
	&VNET_NAME(arp_log_level), 0,
	"Minimum log(9) level for recording rate limited arp log messages. "
	"The higher will be log more (emerg=0, info=6 (default), debug=7).");
/*
 * Rate limited logging helper.
 *
 * Emits the message through log(9), prefixed with "arp: ", only when @pri
 * is numerically not greater than V_arp_log_level and ppsratecheck()
 * allows another message under the arp_maxpps limit.  The first variadic
 * argument must be a string literal, because it is concatenated with the
 * prefix at compile time.
 */
#define	ARP_LOG(pri, ...)	do {					\
	if ((pri) <= V_arp_log_level &&					\
	    ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps))	\
		log((pri), "arp: " __VA_ARGS__);			\
} while (0)
181
/* Forward declarations of file-local routines. */
static void	arpintr(struct mbuf *);
static void	arptimer(void *);
#ifdef INET
static void	in_arpinput(struct mbuf *);
#endif

static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr,
    struct ifnet *ifp, int bridged, struct llentry *la);
static void arp_mark_lle_reachable(struct llentry *la);
static void arp_iflladdr(void *arg __unused, struct ifnet *ifp);

/* Tag for the arp_iflladdr event handler registration. */
static eventhandler_tag iflladdr_tag;

/* netisr descriptor for NETISR_ARP: packets are handed to arpintr(). */
static const struct netisr_handler arp_nh = {
	.nh_name = "arp",
	.nh_handler = arpintr,
	.nh_proto = NETISR_ARP,
	.nh_policy = NETISR_POLICY_SOURCE,
};
201
/*
 * Timeout routine.  Age arp_tab entries periodically.
 *
 * Callout handler for the lle_timer of a link-layer entry; @arg is the
 * struct llentry the timer belongs to.  Depending on the entry's ln_state
 * the handler either re-arms the timer (optionally sending a refresh
 * request first) or unlinks the entry from its table and frees it.
 * Static entries are left untouched.
 */
static void
arptimer(void *arg)
{
	struct llentry *lle = (struct llentry *)arg;
	struct ifnet *ifp;
	int r_skip_req;

	/* Static entries never age; checked before the entry lock is taken. */
	if (lle->la_flags & LLE_STATIC) {
		return;
	}
	LLE_WLOCK(lle);
	if (callout_pending(&lle->lle_timer)) {
		/*
		 * We are a bit odd here in the treatment of
		 * active/pending. If the pending bit is set, it got
		 * rescheduled before I ran. The active
		 * bit we ignore, since if it was stopped
		 * in ll_tablefree() and was currently running
		 * it would have returned 0 so the code would
		 * not have deleted it since the callout could
		 * not be stopped so we want to go through
		 * with the delete here now. If the callout
		 * was restarted, the pending bit will be back on and
		 * we just want to bail since the callout_reset would
		 * return 1 and our reference would have been removed
		 * by arpresolve() below.
		 */
		LLE_WUNLOCK(lle);
 		return;
 	}
	ifp = lle->lle_tbl->llt_ifp;
	CURVNET_SET(ifp->if_vnet);

	switch (lle->ln_state) {
	case ARP_LLINFO_REACHABLE:

		/*
		 * Expiration time is approaching.
		 * Let's try to refresh entry if it is still
		 * in use.
		 *
		 * Set r_skip_req to get feedback from
		 * fast path. Change state and re-schedule
		 * ourselves.
		 */
		LLE_REQ_LOCK(lle);
		lle->r_skip_req = 1;
		LLE_REQ_UNLOCK(lle);
		lle->ln_state = ARP_LLINFO_VERIFY;
		callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
		LLE_WUNLOCK(lle);
		CURVNET_RESTORE();
		return;
	case ARP_LLINFO_VERIFY:
		/* Sample the feedback flag that was armed in the REACHABLE case. */
		LLE_REQ_LOCK(lle);
		r_skip_req = lle->r_skip_req;
		LLE_REQ_UNLOCK(lle);

		if (r_skip_req == 0 && lle->la_preempt > 0) {
			/* Entry was used, issue refresh request */
			struct epoch_tracker et;
			struct in_addr dst;

			/*
			 * Copy the address while the entry is still locked;
			 * the request itself is sent after the lock is
			 * dropped.
			 */
			dst = lle->r_l3addr.addr4;
			lle->la_preempt--;
			callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
			LLE_WUNLOCK(lle);
			/* arprequest_internal() asserts the network epoch. */
			NET_EPOCH_ENTER(et);
			arprequest(ifp, NULL, &dst, NULL);
			NET_EPOCH_EXIT(et);
			CURVNET_RESTORE();
			return;
		}
		/* Nothing happened. Reschedule if not too late */
		if (lle->la_expire > time_uptime) {
			callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
			LLE_WUNLOCK(lle);
			CURVNET_RESTORE();
			return;
		}
		/* Expired without being refreshed: fall out and delete. */
		break;
	case ARP_LLINFO_INCOMPLETE:
	case ARP_LLINFO_DELETED:
		break;
	}

	/*
	 * Tell lle_event listeners why the entry goes away, unless it has
	 * already been flagged as deleted: a valid entry has expired, an
	 * entry that was never resolved has timed out.
	 */
	if ((lle->la_flags & LLE_DELETED) == 0) {
		int evt;

		if (lle->la_flags & LLE_VALID)
			evt = LLENTRY_EXPIRED;
		else
			evt = LLENTRY_TIMEDOUT;
		EVENTHANDLER_INVOKE(lle_event, lle, evt);
	}

	callout_stop(&lle->lle_timer);

	/* XXX: LOR avoidance. We still have ref on lle. */
	LLE_WUNLOCK(lle);
	IF_AFDATA_LOCK(ifp);
	LLE_WLOCK(lle);

	/* Guard against race with other llentry_free(). */
	if (lle->la_flags & LLE_LINKED) {
		LLE_REMREF(lle);
		lltable_unlink_entry(lle->lle_tbl, lle);
	}
	IF_AFDATA_UNLOCK(ifp);

	/* The value returned by llentry_free() is accounted as dropped packets. */
	size_t pkts_dropped = llentry_free(lle);

	ARPSTAT_ADD(dropped, pkts_dropped);
	ARPSTAT_INC(timeouts);

	CURVNET_RESTORE();
}
322
323/*
324 * Stores link-layer header for @ifp in format suitable for if_output()
325 * into buffer @buf. Resulting header length is stored in @bufsize.
326 *
327 * Returns 0 on success.
328 */
329static int
330arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf,
331    size_t *bufsize)
332{
333	struct if_encap_req ereq;
334	int error;
335
336	bzero(buf, *bufsize);
337	bzero(&ereq, sizeof(ereq));
338	ereq.buf = buf;
339	ereq.bufsize = *bufsize;
340	ereq.rtype = IFENCAP_LL;
341	ereq.family = AF_ARP;
342	ereq.lladdr = ar_tha(ah);
343	ereq.hdata = (u_char *)ah;
344	if (bcast)
345		ereq.flags = IFENCAP_FLAG_BROADCAST;
346	error = ifp->if_requestencap(ifp, &ereq);
347	if (error == 0)
348		*bufsize = ereq.bufsize;
349
350	return (error);
351}
352
353/*
354 * Broadcast an ARP request. Caller specifies:
355 *	- arp header source ip address
356 *	- arp header target ip address
357 *	- arp header source ethernet address
358 */
359static int
360arprequest_internal(struct ifnet *ifp, const struct in_addr *sip,
361    const struct in_addr *tip, u_char *enaddr)
362{
363	struct mbuf *m;
364	struct arphdr *ah;
365	struct sockaddr sa;
366	u_char *carpaddr = NULL;
367	uint8_t linkhdr[LLE_MAX_LINKHDR];
368	size_t linkhdrsize;
369	struct route ro;
370	int error;
371
372	NET_EPOCH_ASSERT();
373
374	if (sip == NULL) {
375		/*
376		 * The caller did not supply a source address, try to find
377		 * a compatible one among those assigned to this interface.
378		 */
379		struct ifaddr *ifa;
380
381		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
382			if (ifa->ifa_addr->sa_family != AF_INET)
383				continue;
384
385			if (ifa->ifa_carp) {
386				if ((*carp_iamatch_p)(ifa, &carpaddr) == 0)
387					continue;
388				sip = &IA_SIN(ifa)->sin_addr;
389			} else {
390				carpaddr = NULL;
391				sip = &IA_SIN(ifa)->sin_addr;
392			}
393
394			if (0 == ((sip->s_addr ^ tip->s_addr) &
395			    IA_MASKSIN(ifa)->sin_addr.s_addr))
396				break;  /* found it. */
397		}
398		if (sip == NULL) {
399			printf("%s: cannot find matching address\n", __func__);
400			return (EADDRNOTAVAIL);
401		}
402	}
403	if (enaddr == NULL)
404		enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp);
405
406	if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
407		return (ENOMEM);
408	m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) +
409		2 * ifp->if_addrlen;
410	m->m_pkthdr.len = m->m_len;
411	M_ALIGN(m, m->m_len);
412	ah = mtod(m, struct arphdr *);
413	bzero((caddr_t)ah, m->m_len);
414#ifdef MAC
415	mac_netinet_arp_send(ifp, m);
416#endif
417	ah->ar_pro = htons(ETHERTYPE_IP);
418	ah->ar_hln = ifp->if_addrlen;		/* hardware address length */
419	ah->ar_pln = sizeof(struct in_addr);	/* protocol address length */
420	ah->ar_op = htons(ARPOP_REQUEST);
421	bcopy(enaddr, ar_sha(ah), ah->ar_hln);
422	bcopy(sip, ar_spa(ah), ah->ar_pln);
423	bcopy(tip, ar_tpa(ah), ah->ar_pln);
424	sa.sa_family = AF_ARP;
425	sa.sa_len = 2;
426
427	/* Calculate link header for sending frame */
428	bzero(&ro, sizeof(ro));
429	linkhdrsize = sizeof(linkhdr);
430	error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize);
431	if (error != 0 && error != EAFNOSUPPORT) {
432		ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
433		    if_name(ifp), error);
434		return (error);
435	}
436
437	ro.ro_prepend = linkhdr;
438	ro.ro_plen = linkhdrsize;
439	ro.ro_flags = 0;
440
441	m->m_flags |= M_BCAST;
442	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
443	error = (*ifp->if_output)(ifp, m, &sa, &ro);
444	ARPSTAT_INC(txrequests);
445	if (error) {
446		ARPSTAT_INC(txerrors);
447		ARP_LOG(LOG_DEBUG, "Failed to send ARP packet on %s: %d\n",
448		    if_name(ifp), error);
449	}
450	return (error);
451}
452
453void
454arprequest(struct ifnet *ifp, const struct in_addr *sip,
455    const struct in_addr *tip, u_char *enaddr)
456{
457
458	(void) arprequest_internal(ifp, sip, tip, enaddr);
459}
460
/*
 * Resolve an IP address into an ethernet address - heavy version.
 * Used internally by arpresolve().
 * We have already checked that we can't use an existing lle without
 * modification so we have to acquire an LLE_EXCLUSIVE lle lock.
 *
 * On input:
 *    ifp is the interface we use
 *    is_gw != 0 selects EHOSTUNREACH instead of EHOSTDOWN once the
 *        request limit has been reached
 *    flags: with LLE_CREATE the initial lookup is skipped and a new entry
 *        is allocated; with LLE_ADDRONLY only the link-layer address is
 *        copied to desten rather than the full prepend data
 *    m is the packet to hold pending resolution, may be NULL
 *    dst is the address to resolve
 *    pflags and plle are optional outputs (may be NULL)
 *
 * On success, desten and pflags are filled in and the function returns 0;
 * if plle is given it receives the entry with an additional reference.
 * If the packet must be held pending resolution, we return EWOULDBLOCK
 * On other errors, we return the corresponding error code.
 * Note that m_freem() handles NULL.
 */
static int
arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m,
	const struct sockaddr *dst, u_char *desten, uint32_t *pflags,
	struct llentry **plle)
{
	struct llentry *la = NULL, *la_tmp;
	struct mbuf *curr = NULL;
	struct mbuf *next = NULL;
	int error, renew;
	char *lladdr;
	int ll_len;

	NET_EPOCH_ASSERT();

	if (pflags != NULL)
		*pflags = 0;
	if (plle != NULL)
		*plle = NULL;

	if ((flags & LLE_CREATE) == 0)
		la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
	/* No entries are created on interfaces with IFF_NOARP/IFF_STATICARP. */
	if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) {
		la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
		if (la == NULL) {
			char addrbuf[INET_ADDRSTRLEN];

			log(LOG_DEBUG,
			    "arpresolve: can't allocate llinfo for %s on %s\n",
			    inet_ntoa_r(SIN(dst)->sin_addr, addrbuf),
			    if_name(ifp));
			m_freem(m);
			return (EINVAL);
		}

		/*
		 * Re-check under the table lock: another context may have
		 * linked an entry for dst in the meantime.
		 */
		IF_AFDATA_WLOCK(ifp);
		LLE_WLOCK(la);
		la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
		/* Prefer ANY existing lle over newly-created one */
		if (la_tmp == NULL)
			lltable_link_entry(LLTABLE(ifp), la);
		IF_AFDATA_WUNLOCK(ifp);
		if (la_tmp != NULL) {
			lltable_free_entry(LLTABLE(ifp), la);
			la = la_tmp;
		}
	}
	if (la == NULL) {
		m_freem(m);
		return (EINVAL);
	}

	/* Usable entry: valid and either static or not yet expired. */
	if ((la->la_flags & LLE_VALID) &&
	    ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
		if (flags & LLE_ADDRONLY) {
			lladdr = la->ll_addr;
			ll_len = ifp->if_addrlen;
		} else {
			lladdr = la->r_linkdata;
			ll_len = la->r_hdrlen;
		}
		bcopy(lladdr, desten, ll_len);

		/* Notify LLE code that the entry was used by datapath */
		llentry_mark_used(la);
		if (pflags != NULL)
			*pflags = la->la_flags & (LLE_VALID|LLE_IFADDR);
		if (plle) {
			LLE_ADDREF(la);
			*plle = la;
		}
		LLE_WUNLOCK(la);
		return (0);
	}

	/*
	 * la_expire is set to time_uptime whenever a request is sent below,
	 * so a new request goes out only if none was sent yet or the last
	 * one was sent in a different second.
	 */
	renew = (la->la_asked == 0 || la->la_expire != time_uptime);
	/*
	 * There is an arptab entry, but no ethernet address
	 * response yet.  Add the mbuf to the list, dropping
	 * the oldest packet if we have exceeded the system
	 * setting.
	 */
	if (m != NULL) {
		if (la->la_numheld >= V_arp_maxhold) {
			if (la->la_hold != NULL) {
				next = la->la_hold->m_nextpkt;
				m_freem(la->la_hold);
				la->la_hold = next;
				la->la_numheld--;
				ARPSTAT_INC(dropped);
			}
		}
		/* Append m at the tail of the m_nextpkt chain. */
		if (la->la_hold != NULL) {
			curr = la->la_hold;
			while (curr->m_nextpkt != NULL)
				curr = curr->m_nextpkt;
			curr->m_nextpkt = m;
		} else
			la->la_hold = m;
		la->la_numheld++;
	}
	/*
	 * Return EWOULDBLOCK if we have tried less than arp_maxtries. It
	 * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH
	 * if we have already sent arp_maxtries ARP requests. Retransmit the
	 * ARP request, but not faster than one request per second.
	 */
	if (la->la_asked < V_arp_maxtries)
		error = EWOULDBLOCK;	/* First request. */
	else
		error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN;

	if (renew) {
		int canceled, e;

		/*
		 * Take a reference on behalf of the timer.  A non-zero
		 * return from callout_reset() is treated as "a pending
		 * timer was replaced"; the reference that timer held is
		 * kept and the one just taken is dropped again.
		 */
		LLE_ADDREF(la);
		la->la_expire = time_uptime;
		canceled = callout_reset(&la->lle_timer, hz * V_arpt_down,
		    arptimer, la);
		if (canceled)
			LLE_REMREF(la);
		la->la_asked++;
		/* The request is sent with the entry unlocked. */
		LLE_WUNLOCK(la);
		e = arprequest_internal(ifp, NULL, &SIN(dst)->sin_addr, NULL);
		/*
		 * Only overwrite 'error' in case of error; in case of success
		 * the proper return value was already set above.
		 */
		if (e != 0)
			return (e);
		return (error);
	}

	LLE_WUNLOCK(la);
	return (error);
}
607
608/*
609 * Lookups link header based on an IP address.
610 * On input:
611 *    ifp is the interface we use
612 *    is_gw != 0 if @dst represents gateway to some destination
613 *    m is the mbuf. May be NULL if we don't have a packet.
614 *    dst is the next hop,
615 *    desten is the storage to put LL header.
616 *    flags returns subset of lle flags: LLE_VALID | LLE_IFADDR
617 *
618 * On success, full/partial link header and flags are filled in and
619 * the function returns 0.
620 * If the packet must be held pending resolution, we return EWOULDBLOCK
621 * On other errors, we return the corresponding error code.
622 * Note that m_freem() handles NULL.
623 */
624int
625arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
626	const struct sockaddr *dst, u_char *desten, uint32_t *pflags,
627	struct llentry **plle)
628{
629	struct llentry *la = NULL;
630
631	NET_EPOCH_ASSERT();
632
633	if (pflags != NULL)
634		*pflags = 0;
635	if (plle != NULL)
636		*plle = NULL;
637
638	if (m != NULL) {
639		if (m->m_flags & M_BCAST) {
640			/* broadcast */
641			(void)memcpy(desten,
642			    ifp->if_broadcastaddr, ifp->if_addrlen);
643			return (0);
644		}
645		if (m->m_flags & M_MCAST) {
646			/* multicast */
647			ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
648			return (0);
649		}
650	}
651
652	la = lla_lookup(LLTABLE(ifp), plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, dst);
653	if (la != NULL && (la->r_flags & RLLE_VALID) != 0) {
654		/* Entry found, let's copy lle info */
655		bcopy(la->r_linkdata, desten, la->r_hdrlen);
656		if (pflags != NULL)
657			*pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR);
658		/* Notify the LLE handling code that the entry was used. */
659		llentry_mark_used(la);
660		if (plle) {
661			LLE_ADDREF(la);
662			*plle = la;
663			LLE_WUNLOCK(la);
664		}
665		return (0);
666	}
667	if (plle && la)
668		LLE_WUNLOCK(la);
669
670	return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst,
671	    desten, pflags, plle));
672}
673
674/*
675 * Common length and type checks are done here,
676 * then the protocol-specific routine is called.
677 */
678static void
679arpintr(struct mbuf *m)
680{
681	struct arphdr *ar;
682	struct ifnet *ifp;
683	char *layer;
684	int hlen;
685
686	ifp = m->m_pkthdr.rcvif;
687
688	if (m->m_len < sizeof(struct arphdr) &&
689	    ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
690		ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n",
691		    if_name(ifp));
692		return;
693	}
694	ar = mtod(m, struct arphdr *);
695
696	/* Check if length is sufficient */
697	if (m->m_len <  arphdr_len(ar)) {
698		m = m_pullup(m, arphdr_len(ar));
699		if (m == NULL) {
700			ARP_LOG(LOG_NOTICE, "short packet received on %s\n",
701			    if_name(ifp));
702			return;
703		}
704		ar = mtod(m, struct arphdr *);
705	}
706
707	hlen = 0;
708	layer = "";
709	switch (ntohs(ar->ar_hrd)) {
710	case ARPHRD_ETHER:
711		hlen = ETHER_ADDR_LEN; /* RFC 826 */
712		layer = "ethernet";
713		break;
714	case ARPHRD_INFINIBAND:
715		hlen = 20;	/* RFC 4391, INFINIBAND_ALEN */
716		layer = "infiniband";
717		break;
718	case ARPHRD_IEEE1394:
719		hlen = 0; /* SHALL be 16 */ /* RFC 2734 */
720		layer = "firewire";
721
722		/*
723		 * Restrict too long hardware addresses.
724		 * Currently we are capable of handling 20-byte
725		 * addresses ( sizeof(lle->ll_addr) )
726		 */
727		if (ar->ar_hln >= 20)
728			hlen = 16;
729		break;
730	default:
731		ARP_LOG(LOG_NOTICE,
732		    "packet with unknown hardware format 0x%02d received on "
733		    "%s\n", ntohs(ar->ar_hrd), if_name(ifp));
734		m_freem(m);
735		return;
736	}
737
738	if (hlen != 0 && hlen != ar->ar_hln) {
739		ARP_LOG(LOG_NOTICE,
740		    "packet with invalid %s address length %d received on %s\n",
741		    layer, ar->ar_hln, if_name(ifp));
742		m_freem(m);
743		return;
744	}
745
746	ARPSTAT_INC(received);
747	switch (ntohs(ar->ar_pro)) {
748#ifdef INET
749	case ETHERTYPE_IP:
750		in_arpinput(m);
751		return;
752#endif
753	}
754	m_freem(m);
755}
756
757#ifdef INET
758/*
759 * ARP for Internet protocols on 10 Mb/s Ethernet.
760 * Algorithm is that given in RFC 826.
761 * In addition, a sanity check is performed on the sender
762 * protocol address, to catch impersonators.
763 * We no longer handle negotiations for use of trailer protocol:
764 * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
765 * along with IP replies if we wanted trailers sent to us,
766 * and also sent them in response to IP replies.
767 * This allowed either end to announce the desire to receive
768 * trailer packets.
769 * We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
770 * but formerly didn't normally send requests.
771 */
/*
 * Global (not per-VNET) knobs for ARP input processing, each exported
 * read/write below.  The three log_arp_* flags enable the log messages
 * named in their sysctl descriptions; allow_multicast, when zero, makes
 * in_arpinput() drop packets whose sender hardware address is multicast.
 */
static int log_arp_wrong_iface = 1;
static int log_arp_movements = 1;
static int log_arp_permanent_modify = 1;
static int allow_multicast = 0;

SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
	&log_arp_wrong_iface, 0,
	"log arp packets arriving on the wrong interface");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
	&log_arp_movements, 0,
	"log arp replies from MACs different than the one in the cache");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
	&log_arp_permanent_modify, 0,
	"log arp replies from MACs different than the one in the permanent arp entry");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW,
	&allow_multicast, 0, "accept multicast addresses");
788
789static void
790in_arpinput(struct mbuf *m)
791{
792	struct rm_priotracker in_ifa_tracker;
793	struct arphdr *ah;
794	struct ifnet *ifp = m->m_pkthdr.rcvif;
795	struct llentry *la = NULL, *la_tmp;
796	struct ifaddr *ifa;
797	struct in_ifaddr *ia;
798	struct sockaddr sa;
799	struct in_addr isaddr, itaddr, myaddr;
800	u_int8_t *enaddr = NULL;
801	int op;
802	int bridged = 0, is_bridge = 0;
803	int carped;
804	struct sockaddr_in sin;
805	struct sockaddr *dst;
806	struct nhop_object *nh;
807	uint8_t linkhdr[LLE_MAX_LINKHDR];
808	struct route ro;
809	size_t linkhdrsize;
810	int lladdr_off;
811	int error;
812	char addrbuf[INET_ADDRSTRLEN];
813
814	NET_EPOCH_ASSERT();
815
816	sin.sin_len = sizeof(struct sockaddr_in);
817	sin.sin_family = AF_INET;
818	sin.sin_addr.s_addr = 0;
819
820	if (ifp->if_bridge)
821		bridged = 1;
822	if (ifp->if_type == IFT_BRIDGE)
823		is_bridge = 1;
824
825	/*
826	 * We already have checked that mbuf contains enough contiguous data
827	 * to hold entire arp message according to the arp header.
828	 */
829	ah = mtod(m, struct arphdr *);
830
831	/*
832	 * ARP is only for IPv4 so we can reject packets with
833	 * a protocol length not equal to an IPv4 address.
834	 */
835	if (ah->ar_pln != sizeof(struct in_addr)) {
836		ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n",
837		    sizeof(struct in_addr));
838		goto drop;
839	}
840
841	if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) {
842		ARP_LOG(LOG_NOTICE, "%*D is multicast\n",
843		    ifp->if_addrlen, (u_char *)ar_sha(ah), ":");
844		goto drop;
845	}
846
847	op = ntohs(ah->ar_op);
848	(void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
849	(void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
850
851	if (op == ARPOP_REPLY)
852		ARPSTAT_INC(rxreplies);
853
854	/*
855	 * For a bridge, we want to check the address irrespective
856	 * of the receive interface. (This will change slightly
857	 * when we have clusters of interfaces).
858	 */
859	IN_IFADDR_RLOCK(&in_ifa_tracker);
860	LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
861		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
862		    ia->ia_ifp == ifp) &&
863		    itaddr.s_addr == ia->ia_addr.sin_addr.s_addr &&
864		    (ia->ia_ifa.ifa_carp == NULL ||
865		    (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) {
866			ifa_ref(&ia->ia_ifa);
867			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
868			goto match;
869		}
870	}
871	LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
872		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
873		    ia->ia_ifp == ifp) &&
874		    isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
875			ifa_ref(&ia->ia_ifa);
876			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
877			goto match;
878		}
879
880#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia)				\
881  (ia->ia_ifp->if_bridge == ifp->if_softc &&				\
882  !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) &&	\
883  addr == ia->ia_addr.sin_addr.s_addr)
884	/*
885	 * Check the case when bridge shares its MAC address with
886	 * some of its children, so packets are claimed by bridge
887	 * itself (bridge_input() does it first), but they are really
888	 * meant to be destined to the bridge member.
889	 */
890	if (is_bridge) {
891		LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
892			if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
893				ifa_ref(&ia->ia_ifa);
894				ifp = ia->ia_ifp;
895				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
896				goto match;
897			}
898		}
899	}
900#undef BDG_MEMBER_MATCHES_ARP
901	IN_IFADDR_RUNLOCK(&in_ifa_tracker);
902
903	/*
904	 * No match, use the first inet address on the receive interface
905	 * as a dummy address for the rest of the function.
906	 */
907	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
908		if (ifa->ifa_addr->sa_family == AF_INET &&
909		    (ifa->ifa_carp == NULL ||
910		    (*carp_iamatch_p)(ifa, &enaddr))) {
911			ia = ifatoia(ifa);
912			ifa_ref(ifa);
913			goto match;
914		}
915
916	/*
917	 * If bridging, fall back to using any inet address.
918	 */
919	IN_IFADDR_RLOCK(&in_ifa_tracker);
920	if (!bridged || (ia = CK_STAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
921		IN_IFADDR_RUNLOCK(&in_ifa_tracker);
922		goto drop;
923	}
924	ifa_ref(&ia->ia_ifa);
925	IN_IFADDR_RUNLOCK(&in_ifa_tracker);
926match:
927	if (!enaddr)
928		enaddr = (u_int8_t *)IF_LLADDR(ifp);
929	carped = (ia->ia_ifa.ifa_carp != NULL);
930	myaddr = ia->ia_addr.sin_addr;
931	ifa_free(&ia->ia_ifa);
932	if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
933		goto drop;	/* it's from me, ignore it. */
934	if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
935		ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address "
936		    "%s!\n", inet_ntoa_r(isaddr, addrbuf));
937		goto drop;
938	}
939
940	if (ifp->if_addrlen != ah->ar_hln) {
941		ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
942		    "i/f %d (ignored)\n", ifp->if_addrlen,
943		    (u_char *) ar_sha(ah), ":", ah->ar_hln,
944		    ifp->if_addrlen);
945		goto drop;
946	}
947
948	/*
949	 * Warn if another host is using the same IP address, but only if the
950	 * IP address isn't 0.0.0.0, which is used for DHCP only, in which
951	 * case we suppress the warning to avoid false positive complaints of
952	 * potential misconfiguration.
953	 */
954	if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr &&
955	    myaddr.s_addr != 0) {
956		ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n",
957		   ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
958		   inet_ntoa_r(isaddr, addrbuf), ifp->if_xname);
959		itaddr = myaddr;
960		ARPSTAT_INC(dupips);
961		goto reply;
962	}
963	if (ifp->if_flags & IFF_STATICARP)
964		goto reply;
965
966	bzero(&sin, sizeof(sin));
967	sin.sin_len = sizeof(struct sockaddr_in);
968	sin.sin_family = AF_INET;
969	sin.sin_addr = isaddr;
970	dst = (struct sockaddr *)&sin;
971	la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
972	if (la != NULL)
973		arp_check_update_lle(ah, isaddr, ifp, bridged, la);
974	else if (itaddr.s_addr == myaddr.s_addr) {
975		/*
976		 * Request/reply to our address, but no lle exists yet.
977		 * Calculate full link prepend to use in lle.
978		 */
979		linkhdrsize = sizeof(linkhdr);
980		if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
981		    &linkhdrsize, &lladdr_off) != 0)
982			goto reply;
983
984		/* Allocate new entry */
985		la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
986		if (la == NULL) {
987			/*
988			 * lle creation may fail if source address belongs
989			 * to non-directly connected subnet. However, we
990			 * will try to answer the request instead of dropping
991			 * frame.
992			 */
993			goto reply;
994		}
995		lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
996		    lladdr_off);
997
998		IF_AFDATA_WLOCK(ifp);
999		LLE_WLOCK(la);
1000		la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
1001
1002		/*
1003		 * Check if lle still does not exists.
1004		 * If it does, that means that we either
1005		 * 1) have configured it explicitly, via
1006		 * 1a) 'arp -s' static entry or
1007		 * 1b) interface address static record
1008		 * or
1009		 * 2) it was the result of sending first packet to-host
1010		 * or
1011		 * 3) it was another arp reply packet we handled in
1012		 * different thread.
1013		 *
1014		 * In all cases except 3) we definitely need to prefer
1015		 * existing lle. For the sake of simplicity, prefer any
1016		 * existing lle over newly-create one.
1017		 */
1018		if (la_tmp == NULL)
1019			lltable_link_entry(LLTABLE(ifp), la);
1020		IF_AFDATA_WUNLOCK(ifp);
1021
1022		if (la_tmp == NULL) {
1023			arp_mark_lle_reachable(la);
1024			LLE_WUNLOCK(la);
1025		} else {
1026			/* Free newly-create entry and handle packet */
1027			lltable_free_entry(LLTABLE(ifp), la);
1028			la = la_tmp;
1029			la_tmp = NULL;
1030			arp_check_update_lle(ah, isaddr, ifp, bridged, la);
1031			/* arp_check_update_lle() returns @la unlocked */
1032		}
1033		la = NULL;
1034	}
1035reply:
1036	if (op != ARPOP_REQUEST)
1037		goto drop;
1038	ARPSTAT_INC(rxrequests);
1039
1040	if (itaddr.s_addr == myaddr.s_addr) {
1041		/* Shortcut.. the receiving interface is the target. */
1042		(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
1043		(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
1044	} else {
1045		/*
1046		 * Destination address is not ours. Check if
1047		 * proxyarp entry exists or proxyarp is turned on globally.
1048		 */
1049		struct llentry *lle;
1050
1051		sin.sin_addr = itaddr;
1052		lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
1053
1054		if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
1055			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
1056			(void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln);
1057			LLE_RUNLOCK(lle);
1058		} else {
1059			if (lle != NULL)
1060				LLE_RUNLOCK(lle);
1061
1062			if (!V_arp_proxyall)
1063				goto drop;
1064
1065			NET_EPOCH_ASSERT();
1066			nh = fib4_lookup(ifp->if_fib, itaddr, 0, 0, 0);
1067			if (nh == NULL)
1068				goto drop;
1069
1070			/*
1071			 * Don't send proxies for nodes on the same interface
1072			 * as this one came out of, or we'll get into a fight
1073			 * over who claims what Ether address.
1074			 */
1075			if (nh->nh_ifp == ifp)
1076				goto drop;
1077
1078			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
1079			(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
1080
1081			/*
1082			 * Also check that the node which sent the ARP packet
1083			 * is on the interface we expect it to be on. This
1084			 * avoids ARP chaos if an interface is connected to the
1085			 * wrong network.
1086			 */
1087
1088			nh = fib4_lookup(ifp->if_fib, isaddr, 0, 0, 0);
1089			if (nh == NULL)
1090				goto drop;
1091			if (nh->nh_ifp != ifp) {
1092				ARP_LOG(LOG_INFO, "proxy: ignoring request"
1093				    " from %s via %s\n",
1094				    inet_ntoa_r(isaddr, addrbuf),
1095				    ifp->if_xname);
1096				goto drop;
1097			}
1098
1099#ifdef DEBUG_PROXY
1100			printf("arp: proxying for %s\n",
1101			    inet_ntoa_r(itaddr, addrbuf));
1102#endif
1103		}
1104	}
1105
1106	if (itaddr.s_addr == myaddr.s_addr &&
1107	    IN_LINKLOCAL(ntohl(itaddr.s_addr))) {
1108		/* RFC 3927 link-local IPv4; always reply by broadcast. */
1109#ifdef DEBUG_LINKLOCAL
1110		printf("arp: sending reply for link-local addr %s\n",
1111		    inet_ntoa_r(itaddr, addrbuf));
1112#endif
1113		m->m_flags |= M_BCAST;
1114		m->m_flags &= ~M_MCAST;
1115	} else {
1116		/* default behaviour; never reply by broadcast. */
1117		m->m_flags &= ~(M_BCAST|M_MCAST);
1118	}
1119	(void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
1120	(void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
1121	ah->ar_op = htons(ARPOP_REPLY);
1122	ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
1123	m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
1124	m->m_pkthdr.len = m->m_len;
1125	m->m_pkthdr.rcvif = NULL;
1126	sa.sa_family = AF_ARP;
1127	sa.sa_len = 2;
1128
1129	/* Calculate link header for sending frame */
1130	bzero(&ro, sizeof(ro));
1131	linkhdrsize = sizeof(linkhdr);
1132	error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize);
1133
1134	/*
1135	 * arp_fillheader() may fail due to lack of support inside encap request
1136	 * routing. This is not necessary an error, AF_ARP can/should be handled
1137	 * by if_output().
1138	 */
1139	if (error != 0 && error != EAFNOSUPPORT) {
1140		ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
1141		    if_name(ifp), error);
1142		return;
1143	}
1144
1145	ro.ro_prepend = linkhdr;
1146	ro.ro_plen = linkhdrsize;
1147	ro.ro_flags = 0;
1148
1149	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
1150	(*ifp->if_output)(ifp, m, &sa, &ro);
1151	ARPSTAT_INC(txreplies);
1152	return;
1153
1154drop:
1155	m_freem(m);
1156}
1157#endif
1158
1159/*
1160 * Checks received arp data against existing @la.
1161 * Updates lle state/performs notification if necessary.
1162 */
1163static void
1164arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp,
1165    int bridged, struct llentry *la)
1166{
1167	struct sockaddr sa;
1168	struct mbuf *m_hold, *m_hold_next;
1169	uint8_t linkhdr[LLE_MAX_LINKHDR];
1170	size_t linkhdrsize;
1171	int lladdr_off;
1172	char addrbuf[INET_ADDRSTRLEN];
1173
1174	LLE_WLOCK_ASSERT(la);
1175
1176	/* the following is not an error when doing bridging */
1177	if (!bridged && la->lle_tbl->llt_ifp != ifp) {
1178		if (log_arp_wrong_iface)
1179			ARP_LOG(LOG_WARNING, "%s is on %s "
1180			    "but got reply from %*D on %s\n",
1181			    inet_ntoa_r(isaddr, addrbuf),
1182			    la->lle_tbl->llt_ifp->if_xname,
1183			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
1184			    ifp->if_xname);
1185		LLE_WUNLOCK(la);
1186		return;
1187	}
1188	if ((la->la_flags & LLE_VALID) &&
1189	    bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) {
1190		if (la->la_flags & LLE_STATIC) {
1191			LLE_WUNLOCK(la);
1192			if (log_arp_permanent_modify)
1193				ARP_LOG(LOG_ERR,
1194				    "%*D attempts to modify "
1195				    "permanent entry for %s on %s\n",
1196				    ifp->if_addrlen,
1197				    (u_char *)ar_sha(ah), ":",
1198				    inet_ntoa_r(isaddr, addrbuf),
1199				    ifp->if_xname);
1200			return;
1201		}
1202		if (log_arp_movements) {
1203			ARP_LOG(LOG_INFO, "%s moved from %*D "
1204			    "to %*D on %s\n",
1205			    inet_ntoa_r(isaddr, addrbuf),
1206			    ifp->if_addrlen,
1207			    (u_char *)la->ll_addr, ":",
1208			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
1209			    ifp->if_xname);
1210		}
1211	}
1212
1213	/* Calculate full link prepend to use in lle */
1214	linkhdrsize = sizeof(linkhdr);
1215	if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
1216	    &linkhdrsize, &lladdr_off) != 0)
1217		return;
1218
1219	/* Check if something has changed */
1220	if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 ||
1221	    (la->la_flags & LLE_VALID) == 0) {
1222		/* Try to perform LLE update */
1223		if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
1224		    lladdr_off) == 0)
1225			return;
1226
1227		/* Clear fast path feedback request if set */
1228		la->r_skip_req = 0;
1229	}
1230
1231	arp_mark_lle_reachable(la);
1232
1233	/*
1234	 * The packets are all freed within the call to the output
1235	 * routine.
1236	 *
1237	 * NB: The lock MUST be released before the call to the
1238	 * output routine.
1239	 */
1240	if (la->la_hold != NULL) {
1241		m_hold = la->la_hold;
1242		la->la_hold = NULL;
1243		la->la_numheld = 0;
1244		lltable_fill_sa_entry(la, &sa);
1245		LLE_WUNLOCK(la);
1246		for (; m_hold != NULL; m_hold = m_hold_next) {
1247			m_hold_next = m_hold->m_nextpkt;
1248			m_hold->m_nextpkt = NULL;
1249			/* Avoid confusing lower layers. */
1250			m_clrprotoflags(m_hold);
1251			(*ifp->if_output)(ifp, m_hold, &sa, NULL);
1252		}
1253	} else
1254		LLE_WUNLOCK(la);
1255}
1256
1257static void
1258arp_mark_lle_reachable(struct llentry *la)
1259{
1260	int canceled, wtime;
1261
1262	LLE_WLOCK_ASSERT(la);
1263
1264	la->ln_state = ARP_LLINFO_REACHABLE;
1265	EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
1266
1267	if (!(la->la_flags & LLE_STATIC)) {
1268		LLE_ADDREF(la);
1269		la->la_expire = time_uptime + V_arpt_keep;
1270		wtime = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit;
1271		if (wtime < 0)
1272			wtime = V_arpt_keep;
1273		canceled = callout_reset(&la->lle_timer,
1274		    hz * wtime, arptimer, la);
1275		if (canceled)
1276			LLE_REMREF(la);
1277	}
1278	la->la_asked = 0;
1279	la->la_preempt = V_arp_maxtries;
1280}
1281
1282/*
1283 * Add permanent link-layer record for given interface address.
1284 */
1285static __noinline void
1286arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst)
1287{
1288	struct llentry *lle, *lle_tmp;
1289
1290	/*
1291	 * Interface address LLE record is considered static
1292	 * because kernel code relies on LLE_STATIC flag to check
1293	 * if these entries can be rewriten by arp updates.
1294	 */
1295	lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC, dst);
1296	if (lle == NULL) {
1297		log(LOG_INFO, "arp_ifinit: cannot create arp "
1298		    "entry for interface address\n");
1299		return;
1300	}
1301
1302	IF_AFDATA_WLOCK(ifp);
1303	LLE_WLOCK(lle);
1304	/* Unlink any entry if exists */
1305	lle_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
1306	if (lle_tmp != NULL)
1307		lltable_unlink_entry(LLTABLE(ifp), lle_tmp);
1308
1309	lltable_link_entry(LLTABLE(ifp), lle);
1310	IF_AFDATA_WUNLOCK(ifp);
1311
1312	if (lle_tmp != NULL)
1313		EVENTHANDLER_INVOKE(lle_event, lle_tmp, LLENTRY_EXPIRED);
1314
1315	EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED);
1316	LLE_WUNLOCK(lle);
1317	if (lle_tmp != NULL)
1318		lltable_free_entry(LLTABLE(ifp), lle_tmp);
1319}
1320
1321/*
1322 * Handle the garp_rexmit_count. Like sysctl_handle_int(), but limits the range
1323 * of valid values.
1324 */
1325static int
1326sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS)
1327{
1328	int error;
1329	int rexmit_count = *(int *)arg1;
1330
1331	error = sysctl_handle_int(oidp, &rexmit_count, 0, req);
1332
1333	/* Enforce limits on any new value that may have been set. */
1334	if (!error && req->newptr) {
1335		/* A new value was set. */
1336		if (rexmit_count < 0) {
1337			rexmit_count = 0;
1338		} else if (rexmit_count > MAX_GARP_RETRANSMITS) {
1339			rexmit_count = MAX_GARP_RETRANSMITS;
1340		}
1341		*(int *)arg1 = rexmit_count;
1342	}
1343
1344	return (error);
1345}
1346
1347/*
1348 * Retransmit a Gratuitous ARP (GARP) and, if necessary, schedule a callout to
1349 * retransmit it again. A pending callout owns a reference to the ifa.
1350 */
1351static void
1352garp_rexmit(void *arg)
1353{
1354	struct in_ifaddr *ia = arg;
1355
1356	if (callout_pending(&ia->ia_garp_timer) ||
1357	    !callout_active(&ia->ia_garp_timer)) {
1358		IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
1359		ifa_free(&ia->ia_ifa);
1360		return;
1361	}
1362
1363	CURVNET_SET(ia->ia_ifa.ifa_ifp->if_vnet);
1364
1365	/*
1366	 * Drop lock while the ARP request is generated.
1367	 */
1368	IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
1369
1370	arprequest(ia->ia_ifa.ifa_ifp, &IA_SIN(ia)->sin_addr,
1371	    &IA_SIN(ia)->sin_addr, IF_LLADDR(ia->ia_ifa.ifa_ifp));
1372
1373	/*
1374	 * Increment the count of retransmissions. If the count has reached the
1375	 * maximum value, stop sending the GARP packets. Otherwise, schedule
1376	 * the callout to retransmit another GARP packet.
1377	 */
1378	++ia->ia_garp_count;
1379	if (ia->ia_garp_count >= garp_rexmit_count) {
1380		ifa_free(&ia->ia_ifa);
1381	} else {
1382		int rescheduled;
1383		IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp);
1384		rescheduled = callout_reset(&ia->ia_garp_timer,
1385		    (1 << ia->ia_garp_count) * hz,
1386		    garp_rexmit, ia);
1387		IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
1388		if (rescheduled) {
1389			ifa_free(&ia->ia_ifa);
1390		}
1391	}
1392
1393	CURVNET_RESTORE();
1394}
1395
1396/*
1397 * Start the GARP retransmit timer.
1398 *
1399 * A single GARP is always transmitted when an IPv4 address is added
1400 * to an interface and that is usually sufficient. However, in some
1401 * circumstances, such as when a shared address is passed between
1402 * cluster nodes, this single GARP may occasionally be dropped or
1403 * lost. This can lead to neighbors on the network link working with a
1404 * stale ARP cache and sending packets destined for that address to
1405 * the node that previously owned the address, which may not respond.
1406 *
1407 * To avoid this situation, GARP retransmits can be enabled by setting
1408 * the net.link.ether.inet.garp_rexmit_count sysctl to a value greater
1409 * than zero. The setting represents the maximum number of
1410 * retransmissions. The interval between retransmissions is calculated
1411 * using an exponential backoff algorithm, doubling each time, so the
1412 * retransmission intervals are: {1, 2, 4, 8, 16, ...} (seconds).
1413 */
1414static void
1415garp_timer_start(struct ifaddr *ifa)
1416{
1417	struct in_ifaddr *ia = (struct in_ifaddr *) ifa;
1418
1419	IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp);
1420	ia->ia_garp_count = 0;
1421	if (callout_reset(&ia->ia_garp_timer, (1 << ia->ia_garp_count) * hz,
1422	    garp_rexmit, ia) == 0) {
1423		ifa_ref(ifa);
1424	}
1425	IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
1426}
1427
1428void
1429arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
1430{
1431	struct epoch_tracker et;
1432	const struct sockaddr_in *dst_in;
1433	const struct sockaddr *dst;
1434
1435	if (ifa->ifa_carp != NULL)
1436		return;
1437
1438	dst = ifa->ifa_addr;
1439	dst_in = (const struct sockaddr_in *)dst;
1440
1441	if (ntohl(dst_in->sin_addr.s_addr) == INADDR_ANY)
1442		return;
1443	NET_EPOCH_ENTER(et);
1444	arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp));
1445	NET_EPOCH_EXIT(et);
1446	if (garp_rexmit_count > 0) {
1447		garp_timer_start(ifa);
1448	}
1449
1450	arp_add_ifa_lle(ifp, dst);
1451}
1452
1453void
1454arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr)
1455{
1456
1457	if (ntohl(addr.s_addr) != INADDR_ANY)
1458		arprequest(ifp, &addr, &addr, enaddr);
1459}
1460
1461/*
1462 * Sends gratuitous ARPs for each ifaddr to notify other
1463 * nodes about the address change.
1464 */
1465static __noinline void
1466arp_handle_ifllchange(struct ifnet *ifp)
1467{
1468	struct ifaddr *ifa;
1469
1470	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1471		if (ifa->ifa_addr->sa_family == AF_INET)
1472			arp_ifinit(ifp, ifa);
1473	}
1474}
1475
1476/*
1477 * A handler for interface link layer address change event.
1478 */
1479static void
1480arp_iflladdr(void *arg __unused, struct ifnet *ifp)
1481{
1482	/* if_bridge can update its lladdr during if_vmove(), after we've done
1483	 * if_detach_internal()/dom_ifdetach(). */
1484	if (ifp->if_afdata[AF_INET] == NULL)
1485		return;
1486
1487	lltable_update_ifaddr(LLTABLE(ifp));
1488
1489	if ((ifp->if_flags & IFF_UP) != 0)
1490		arp_handle_ifllchange(ifp);
1491}
1492
1493static void
1494vnet_arp_init(void)
1495{
1496
1497	if (IS_DEFAULT_VNET(curvnet)) {
1498		netisr_register(&arp_nh);
1499		iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
1500		    arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
1501	}
1502#ifdef VIMAGE
1503	else
1504		netisr_register_vnet(&arp_nh);
1505#endif
1506}
1507VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND,
1508    vnet_arp_init, 0);
1509
#ifdef VIMAGE
/*
 * Per-vnet teardown: unregister the ARP netisr handler for the vnet.
 *
 * ARP has to be unregistered along with IP, otherwise we risk doing
 * INADDR_HASH lookups after the hash has been destroyed.  Ideally this
 * would go on SI_ORDER_3.5.
 */
static void
vnet_arp_destroy(__unused void *arg)
{

	netisr_unregister_vnet(&arp_nh);
}
VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
    vnet_arp_destroy, NULL);
#endif /* VIMAGE */
1524