1/*
2 * Copyright (c) 2011-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/sysctl.h>
32#include <sys/mbuf.h>
33#include <sys/mcache.h>
34#include <sys/socket.h>
35#include <sys/socketvar.h>
36#include <net/if_types.h>
37#include <net/route.h>
38#include <netinet/in.h>
39#include <netinet/in_systm.h>
40#include <net/if.h>
41#include <net/dlil.h>
42#include <netinet/ip.h>
43#include <netinet/ip_var.h>
44#include <netinet/in_var.h>
45#include <netinet/tcp.h>
46#include <netinet/tcp_seq.h>
47#include <netinet/tcpip.h>
48#include <netinet/tcp_var.h>
49#include <netinet/tcp_lro.h>
50#include <netinet/lro_ext.h>
51#include <kern/locks.h>
52
53unsigned int lrocount = 0; /* A counter used for debugging only */
54unsigned int lro_seq_outoforder = 0; /* Counter for debugging */
55unsigned int lro_seq_mismatch = 0; /* Counter for debugging */
56unsigned int lro_flushes = 0; /* Counter for tracking number of flushes */
57unsigned int lro_single_flushes = 0;
58unsigned int lro_double_flushes = 0;
59unsigned int lro_good_flushes = 0;
60
61unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS;
62SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED,
63		&coalesc_sz, 0, "Max coalescing size");
64
65unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER;
66SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED,
67		&coalesc_time, 0, "Max coalescing time");
68
69struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS];
70
71char lro_flow_map[TCP_LRO_FLOW_MAP];
72
73static lck_attr_t *tcp_lro_mtx_attr = NULL;		/* mutex attributes */
74static lck_grp_t *tcp_lro_mtx_grp = NULL;		/* mutex group */
75static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL;	/* mutex group attrs */
76decl_lck_mtx_data( ,tcp_lro_lock);	/* Used to synchronize updates */
77
78unsigned int lro_byte_count = 0;
79
80uint64_t lro_deadline = 0; /* LRO's sense of time - protected by tcp_lro_lock */
81uint32_t lro_timer_set = 0;
82
83/* Some LRO stats */
84u_int32_t lro_pkt_count = 0; /* Number of packets encountered in an LRO period */
85thread_call_t tcp_lro_timer;
86
87extern u_int32_t kipf_count;
88
89static void	tcp_lro_timer_proc(void*, void*);
90static void	lro_update_stats(struct mbuf*);
91static void	lro_update_flush_stats(struct mbuf *);
92static void	tcp_lro_flush_flows(void);
93static void	tcp_lro_sched_timer(uint64_t);
94static void	lro_proto_input(struct mbuf *);
95
96static struct mbuf *lro_tcp_xsum_validate(struct mbuf*,  struct ip *,
97				struct tcphdr*);
98static struct mbuf *tcp_lro_process_pkt(struct mbuf*, struct ip*, struct tcphdr*,
99				int);
100
101void
102tcp_lro_init(void)
103{
104	int i;
105
106	bzero(lro_flow_list, sizeof (struct lro_flow) * TCP_LRO_NUM_FLOWS);
107	for (i = 0; i < TCP_LRO_FLOW_MAP; i++) {
108		lro_flow_map[i] = TCP_LRO_FLOW_UNINIT;
109	}
110
111	/*
112	 * allocate lock group attribute, group and attribute for tcp_lro_lock
113	 */
114	tcp_lro_mtx_grp_attr = lck_grp_attr_alloc_init();
115	tcp_lro_mtx_grp = lck_grp_alloc_init("tcplro", tcp_lro_mtx_grp_attr);
116	tcp_lro_mtx_attr = lck_attr_alloc_init();
117	lck_mtx_init(&tcp_lro_lock, tcp_lro_mtx_grp, tcp_lro_mtx_attr);
118
119	tcp_lro_timer = thread_call_allocate(tcp_lro_timer_proc, NULL);
120	if (tcp_lro_timer == NULL) {
121		panic_plain("%s: unable to allocate lro timer", __func__);
122	}
123
124	return;
125}
126
127static int
128tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash,
129			int *flow_id )
130{
131	struct lro_flow *flow;
132	tcp_seq seqnum;
133	unsigned int off = 0;
134	int payload_len = 0;
135
136	*hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
137		tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1));
138
139	*flow_id = lro_flow_map[*hash];
140	if (*flow_id == TCP_LRO_FLOW_NOTFOUND) {
141		return TCP_LRO_NAN;
142	}
143
144	seqnum = tcp_hdr->th_seq;
145	off = tcp_hdr->th_off << 2;
146	payload_len = ip_hdr->ip_len - off;
147
148	flow = &lro_flow_list[*flow_id];
149
150	if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
151			(flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
152			(flow->lr_fport == tcp_hdr->th_sport) &&
153			(flow->lr_lport == tcp_hdr->th_dport)) {
154		if (flow->lr_tcphdr == NULL) {
155			if (ntohl(seqnum) == flow->lr_seq) {
156				return TCP_LRO_COALESCE;
157			}
158			if (lrodebug >= 4) {
159				printf("%s: seqnum = %x, lr_seq = %x\n",
160					__func__, ntohl(seqnum), flow->lr_seq);
161			}
162			lro_seq_mismatch++;
163			if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) {
164				lro_seq_outoforder++;
165				/*
166				 * Whenever we receive out of order packets it
167				 * signals loss and recovery and LRO doesn't
168				 * let flows recover quickly. So eject.
169				 */
170				 flow->lr_flags |= LRO_EJECT_REQ;
171
172			}
173			return TCP_LRO_NAN;
174		}
175
176		if (flow->lr_flags & LRO_EJECT_REQ) {
177			if (lrodebug)
178				printf("%s: eject. \n", __func__);
179			return TCP_LRO_EJECT_FLOW;
180		}
181		if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) {
182			if (lrodebug) {
183				printf("%s: th_ack = %x flow_ack = %x \n",
184					__func__, tcp_hdr->th_ack,
185					flow->lr_tcphdr->th_ack);
186			}
187			return TCP_LRO_EJECT_FLOW;
188		}
189
190		if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) + lro_flow_list[*flow_id].lr_len)) {
191			return TCP_LRO_COALESCE;
192		} else {
193			/* LRO does not handle loss recovery well, eject */
194			flow->lr_flags |= LRO_EJECT_REQ;
195			return TCP_LRO_EJECT_FLOW;
196		}
197	}
198	if (lrodebug) printf("tcp_lro_matching_tuple: collision \n");
199	return TCP_LRO_COLLISION;
200}
201
202static void
203tcp_lro_init_flow(int flow_id, struct ip* ip_hdr, struct tcphdr *tcp_hdr,
204			int hash, u_int32_t timestamp, int payload_len)
205{
206	struct lro_flow *flow = NULL;
207
208	flow = &lro_flow_list[flow_id];
209
210	flow->lr_hash_map = hash;
211	flow->lr_faddr.s_addr = ip_hdr->ip_src.s_addr;
212	flow->lr_laddr.s_addr = ip_hdr->ip_dst.s_addr;
213	flow->lr_fport = tcp_hdr->th_sport;
214	flow->lr_lport = tcp_hdr->th_dport;
215	lro_flow_map[hash] = flow_id;
216	flow->lr_timestamp = timestamp;
217	flow->lr_seq = ntohl(tcp_hdr->th_seq) + payload_len;
218	flow->lr_flags = 0;
219	return;
220}
221
222static void
223tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr,
224			int payload_len, int drop_hdrlen, struct tcpopt *topt,
225			u_int32_t* tsval, u_int32_t* tsecr, int thflags)
226{
227	struct lro_flow *flow = NULL;
228	struct mbuf *last;
229	struct ip *ip = NULL;
230
231	flow =  &lro_flow_list[flow_id];
232	if (flow->lr_mhead) {
233		if (lrodebug)
234			printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq,
235				payload_len);
236		m_adj(lro_mb, drop_hdrlen);
237
238		last = flow->lr_mtail;
239		while (last->m_next != NULL) {
240			last = last->m_next;
241		}
242		last->m_next = lro_mb;
243
244		flow->lr_mtail = lro_mb;
245
246		ip = mtod(flow->lr_mhead, struct ip *);
247		ip->ip_len += lro_mb->m_pkthdr.len;
248		flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len;
249
250		if (flow->lr_len == 0) {
251			panic_plain("%s: Inconsistent LRO flow state", __func__);
252		}
253		flow->lr_len += payload_len;
254		flow->lr_seq += payload_len;
255		/*
256		 * This bit is re-OR'd each time a packet is added to the
257		 * large coalesced packet.
258		 */
259		flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
260		flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */
261		if (flow->lr_mhead->m_pkthdr.lro_pktlen <
262				lro_mb->m_pkthdr.lro_pktlen) {
263			/*
264			 * For TCP Inter Arrival Jitter calculation, return max
265			 * size encountered while coalescing a stream of pkts.
266			 */
267			flow->lr_mhead->m_pkthdr.lro_pktlen =
268						lro_mb->m_pkthdr.lro_pktlen;
269		}
270        	/* Update the timestamp value */
271		if (topt->to_flags & TOF_TS) {
272			if ((flow->lr_tsval) &&
273				(TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) {
274				*(flow->lr_tsval) = htonl(topt->to_tsval);
275			}
276			if ((flow->lr_tsecr) &&
277				(topt->to_tsecr != 0) &&
278				(TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) {
279				if (lrodebug >= 2) {
280					printf("%s: instantaneous RTT = %d \n", __func__,
281						topt->to_tsecr - ntohl(*(flow->lr_tsecr)));
282				}
283				*(flow->lr_tsecr) = htonl(topt->to_tsecr);
284			}
285		}
286		/* Coalesce the flags */
287		if (thflags) {
288			flow->lr_tcphdr->th_flags |= thflags;
289		}
290		/* Update receive window */
291		flow->lr_tcphdr->th_win = tcphdr->th_win;
292	} else {
293		if (lro_mb) {
294			flow->lr_mhead = flow->lr_mtail = lro_mb;
295			flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
296			flow->lr_tcphdr = tcphdr;
297			if ((topt) && (topt->to_flags & TOF_TS)) {
298				ASSERT(tsval != NULL);
299				ASSERT(tsecr != NULL);
300				flow->lr_tsval = tsval;
301				flow->lr_tsecr = tsecr;
302			}
303			flow->lr_len = payload_len;
304			calculate_tcp_clock();
305			flow->lr_timestamp = tcp_now;
306			tcp_lro_sched_timer(0);
307		}
308		flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len;
309	}
310	if (lro_mb) {
311		tcpstat.tcps_coalesced_pack++;
312	}
313	return;
314}
315
316static struct mbuf *
317tcp_lro_eject_flow(int flow_id)
318{
319	struct mbuf *mb = NULL;
320
321	mb = lro_flow_list[flow_id].lr_mhead;
322	ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id);
323	lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT;
324	bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow));
325
326	return mb;
327}
328
329static struct mbuf*
330tcp_lro_eject_coalesced_pkt(int flow_id)
331{
332	struct mbuf *mb = NULL;
333	mb = lro_flow_list[flow_id].lr_mhead;
334	lro_flow_list[flow_id].lr_mhead =
335		lro_flow_list[flow_id].lr_mtail = NULL;
336	lro_flow_list[flow_id].lr_tcphdr = NULL;
337	return mb;
338}
339
340static struct mbuf*
341tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr,
342			struct tcphdr *tcp_hdr, int payload_len,
343			int drop_hdrlen, int hash, struct tcpopt *topt,
344			u_int32_t *tsval, u_int32_t *tsecr)
345{
346	int i;
347	int slot_available = 0;
348	int candidate_flow = 0;
349	u_int32_t oldest_timestamp;
350	struct mbuf *mb = NULL;
351	int collision = 0;
352
353	oldest_timestamp = tcp_now;
354
355	/* handle collision */
356	if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) {
357		if (lrodebug) {
358			collision = 1;
359		}
360		candidate_flow = lro_flow_map[hash];
361		tcpstat.tcps_flowtbl_collision++;
362		goto kick_flow;
363	}
364
365	for (i = 0; i < TCP_LRO_NUM_FLOWS; i++) {
366		if (lro_flow_list[i].lr_mhead == NULL) {
367			candidate_flow = i;
368			slot_available = 1;
369			break;
370		}
371		if (oldest_timestamp >= lro_flow_list[i].lr_timestamp) {
372			candidate_flow = i;
373			oldest_timestamp = lro_flow_list[i].lr_timestamp;
374		}
375	}
376
377	if (!slot_available) {
378		tcpstat.tcps_flowtbl_full++;
379kick_flow:
380		/* kick the oldest flow */
381		mb = tcp_lro_eject_flow(candidate_flow);
382
383		if (lrodebug) {
384			if (!slot_available) {
385				printf("%s: slot unavailable.\n",__func__);
386			}
387			if (collision) {
388				printf("%s: collision.\n",__func__);
389			}
390		}
391	} else {
392		candidate_flow = i; /* this is now the flow to be used */
393
394	}
395
396	tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash,
397				tcp_now, payload_len);
398	tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len,
399				drop_hdrlen, topt, tsval, tsecr, 0);
400	return mb;
401}
402
403struct mbuf*
404tcp_lro_process_pkt(struct mbuf *lro_mb, struct ip *ip_hdr,
405				struct tcphdr *tcp_hdr, int drop_hdrlen)
406{
407	int flow_id = TCP_LRO_FLOW_UNINIT;
408	int hash;
409	unsigned int off = 0;
410	int eject_flow = 0;
411	int optlen;
412	int retval = 0;
413	struct mbuf *mb = NULL;
414	int payload_len = 0;
415	u_char *optp = NULL;
416	int thflags = 0;
417	struct tcpopt to;
418	int ret_response = TCP_LRO_CONSUMED;
419	int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0;
420	u_int8_t ecn;
421
422	if (lro_mb->m_len < (int32_t)sizeof (struct tcpiphdr)) {
423		if ((lro_mb = m_pullup(lro_mb, sizeof(struct tcpiphdr))) == 0) {
424			tcpstat.tcps_rcvshort++;
425			m_freem(lro_mb);
426			if (lrodebug) {
427				printf("tcp_lro_process_pkt:mbuf too short.\n");
428			}
429			return NULL;
430		}
431	}
432
433	/* Just in case */
434	lro_mb->m_pkthdr.pkt_flags &= ~PKTF_SW_LRO_DID_CSUM;
435
436	if ((lro_mb = lro_tcp_xsum_validate(lro_mb, ip_hdr, tcp_hdr)) == NULL) {
437		if (lrodebug) {
438			printf("tcp_lro_process_pkt: TCP xsum failed.\n");
439		}
440		return NULL;
441	}
442
443	/* Update stats */
444	lro_pkt_count++;
445
446	/* Avoids checksumming in tcp_input */
447	lro_mb->m_pkthdr.pkt_flags |= PKTF_SW_LRO_DID_CSUM;
448
449	off = tcp_hdr->th_off << 2;
450	optlen = off - sizeof (struct tcphdr);
451	payload_len = ip_hdr->ip_len - off;
452	optp = (u_char *)(tcp_hdr + 1);
453	/*
454	 * Do quick retrieval of timestamp options ("options
455	 * prediction?").  If timestamp is the only option and it's
456	 * formatted as recommended in RFC 1323 appendix A, we
457	 * quickly get the values now and not bother calling
458	 * tcp_dooptions(), etc.
459	 */
460	if ((optlen == TCPOLEN_TSTAMP_APPA ||
461			(optlen > TCPOLEN_TSTAMP_APPA &&
462			optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
463			*(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
464			(tcp_hdr->th_flags & TH_SYN) == 0) {
465			to.to_flags |= TOF_TS;
466			to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
467			to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
468	} else {
469		/*
470		 * If TCP timestamps are not in use, or not the first option,
471		 * skip LRO path since timestamps are used to avoid LRO
472		 * from introducing additional latencies for retransmissions
473		 * and other slow-paced transmissions.
474		 */
475		to.to_flags = to.to_tsecr = 0;
476		eject_flow = 1;
477	}
478
479	/* list all the conditions that can trigger a flow ejection here */
480
481	thflags = tcp_hdr->th_flags;
482	if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) {
483		eject_flow = tcpflags = 1;
484	}
485
486	if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) &&
487			(to.to_flags & TOF_TS))) {
488		eject_flow = unknown_tcpopts = 1;
489	}
490
491	if (payload_len <= LRO_MIN_COALESC_SZ) { /* zero payload ACK */
492		eject_flow = 1;
493	}
494
495	/* Can't coalesce ECN marked packets. */
496	ecn = ip_hdr->ip_tos & IPTOS_ECN_MASK;
497	if (ecn == IPTOS_ECN_CE) {
498		/*
499		 * ECN needs quick notification
500		 */
501		if (lrodebug) {
502			printf("%s: ECE bits set.\n", __func__);
503		}
504		eject_flow = 1;
505	}
506
507	lck_mtx_lock_spin(&tcp_lro_lock);
508
509	retval = tcp_lro_matching_tuple(ip_hdr, tcp_hdr, &hash, &flow_id);
510
511	switch (retval) {
512	case TCP_LRO_NAN:
513		lck_mtx_unlock(&tcp_lro_lock);
514		ret_response = TCP_LRO_FLOW_NOTFOUND;
515		break;
516
517	case TCP_LRO_COALESCE:
518		if ((payload_len != 0) && (unknown_tcpopts == 0) &&
519			(tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) {
520			tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len,
521				drop_hdrlen, &to,
522				(to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL,
523				(to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL,
524				thflags);
525			if (lrodebug >= 2) {
526				printf("tcp_lro_process_pkt: coalesce len = %d. flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n",
527					lro_flow_list[flow_id].lr_len, flow_id,
528					payload_len, drop_hdrlen, optlen,
529					ntohs(lro_flow_list[flow_id].lr_lport),
530					ntohl(tcp_hdr->th_seq));
531			}
532			if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) {
533				eject_flow = 1;
534			}
535			coalesced = 1;
536		}
537		if (eject_flow) {
538			mb = tcp_lro_eject_coalesced_pkt(flow_id);
539			lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) +
540								payload_len;
541			calculate_tcp_clock();
542			u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
543			lck_mtx_unlock(&tcp_lro_lock);
544			if (mb) {
545				mb->m_pkthdr.lro_elapsed = timestamp;
546				lro_proto_input(mb);
547			}
548			if (!coalesced) {
549				if (lrodebug >= 2) {
550					printf("%s: pkt payload_len = %d \n", __func__, payload_len);
551				}
552				lro_proto_input(lro_mb);
553			}
554		} else {
555			lck_mtx_unlock(&tcp_lro_lock);
556		}
557		break;
558
559	case TCP_LRO_EJECT_FLOW:
560		mb = tcp_lro_eject_coalesced_pkt(flow_id);
561		calculate_tcp_clock();
562		u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
563		lck_mtx_unlock(&tcp_lro_lock);
564		if (mb) {
565			if (lrodebug)
566				printf("tcp_lro_process_pkt eject_flow, len = %d\n", mb->m_pkthdr.len);
567			mb->m_pkthdr.lro_elapsed = timestamp;
568			lro_proto_input(mb);
569		}
570
571		lro_proto_input(lro_mb);
572		break;
573
574	case TCP_LRO_COLLISION:
575		lck_mtx_unlock(&tcp_lro_lock);
576		ret_response = TCP_LRO_FLOW_NOTFOUND;
577		break;
578
579	default:
580		lck_mtx_unlock(&tcp_lro_lock);
581		panic_plain("%s: unrecognized type %d", __func__, retval);
582		break;
583	}
584
585	if (ret_response == TCP_LRO_FLOW_NOTFOUND) {
586		lro_proto_input(lro_mb);
587	}
588	return NULL;
589}
590
591static void
592tcp_lro_timer_proc(void *arg1, void *arg2)
593{
594#pragma unused(arg1, arg2)
595
596	lck_mtx_lock_spin(&tcp_lro_lock);
597	lro_timer_set = 0;
598	lck_mtx_unlock(&tcp_lro_lock);
599	tcp_lro_flush_flows();
600}
601
602static void
603tcp_lro_flush_flows(void)
604{
605	int i = 0;
606	struct mbuf *mb;
607	struct lro_flow *flow;
608	int tcpclock_updated = 0;
609
610	lck_mtx_lock(&tcp_lro_lock);
611
612	while (i < TCP_LRO_NUM_FLOWS) {
613		flow = &lro_flow_list[i];
614		if (flow->lr_mhead != NULL) {
615
616			if (!tcpclock_updated) {
617				calculate_tcp_clock();
618				tcpclock_updated = 1;
619			}
620
621			if (lrodebug >= 2)
622				printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n",
623					flow->lr_len,
624					flow->lr_mhead->m_pkthdr.lro_npkts,
625					flow->lr_timestamp, tcp_now);
626
627			u_int8_t timestamp = tcp_now - flow->lr_timestamp;
628
629			mb = tcp_lro_eject_flow(i);
630
631			if (mb) {
632				mb->m_pkthdr.lro_elapsed = timestamp;
633				lck_mtx_unlock(&tcp_lro_lock);
634				lro_update_flush_stats(mb);
635				lro_proto_input(mb);
636				lck_mtx_lock(&tcp_lro_lock);
637			}
638		}
639		i++;
640	}
641	lck_mtx_unlock(&tcp_lro_lock);
642}
643
644/*
645 * Must be called with tcp_lro_lock held.
646 * The hint is non-zero for longer waits. The wait time dictated by coalesc_time
647 * takes precedence, so lro_timer_set is not set for the hint case
648 */
649static void
650tcp_lro_sched_timer(uint64_t hint)
651{
652	if (lro_timer_set) {
653		return;
654	}
655
656	lro_timer_set = 1;
657	if (!hint) {
658		/* the intent is to wake up every coalesc_time msecs */
659		clock_interval_to_deadline(coalesc_time,
660			(NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline);
661	} else {
662		clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ,
663                        &lro_deadline);
664	}
665	thread_call_enter_delayed(tcp_lro_timer, lro_deadline);
666}
667
668struct mbuf*
669tcp_lro(struct mbuf *m, unsigned int hlen)
670{
671	struct ip *ip_hdr;
672	unsigned int tlen;
673	struct tcphdr * tcp_hdr = NULL;
674	unsigned int off = 0;
675
676	if (kipf_count != 0)
677		return m;
678
679	/*
680	 * Experiments on cellular show that the RTT is much higher
681	 * than the coalescing time of 5 msecs, causing lro to flush
682	 * 80% of the time on a single packet. Increasing
683	 * coalescing time for cellular does not show marked
684	 * improvement to throughput either. Loopback perf is hurt
685	 * by the 5 msec latency and it already sends large packets.
686	 */
687	if (IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) ||
688		(m->m_pkthdr.rcvif->if_type == IFT_LOOP)) {
689		return m;
690	}
691
692	ip_hdr = mtod(m, struct ip*);
693
694	/* don't deal with IP options */
695	if (hlen > sizeof (struct ip))
696		return (m);
697
698	/* only TCP is coalesced */
699	if (ip_hdr->ip_p != IPPROTO_TCP) {
700		return m;
701	}
702
703	if (m->m_len < (int32_t) sizeof (struct tcpiphdr)) {
704		if (lrodebug) printf("tcp_lro m_pullup \n");
705		if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
706			tcpstat.tcps_rcvshort++;
707			if (lrodebug) {
708				printf("ip_lro: rcvshort.\n");
709			}
710			return NULL;
711		}
712	}
713
714	tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen);
715	tlen = ip_hdr->ip_len ; //ignore IP header bytes len
716	m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */
717	m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */
718	m->m_pkthdr.lro_elapsed = 0; /* Initialize the field to carry elapsed time */
719	off = tcp_hdr->th_off << 2;
720	if (off < sizeof (struct tcphdr) || off > tlen) {
721		tcpstat.tcps_rcvbadoff++;
722		if (lrodebug) {
723			printf("ip_lro: TCP off greater than TCP header.\n");
724		}
725		return m;
726	}
727
728	return (tcp_lro_process_pkt(m, ip_hdr, tcp_hdr, hlen + off));
729}
730
731static void
732lro_proto_input(struct mbuf *m)
733{
734	struct ip* ip_hdr = mtod(m, struct ip*);
735
736	if (lrodebug >= 3) {
737		printf("lro_proto_input: ip_len = %d \n",
738			ip_hdr->ip_len);
739	}
740	lro_update_stats(m);
741	ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p);
742}
743
744static struct mbuf *
745lro_tcp_xsum_validate(struct mbuf *m, struct ip *ip, struct tcphdr * th)
746{
747	/* Expect 32-bit aligned data pointer on strict-align platforms */
748	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
749
750	/* we shouldn't get here for IP with options; hence sizeof (ip) */
751	if (tcp_input_checksum(AF_INET, m, th, sizeof (*ip), ip->ip_len)) {
752		if (lrodebug)
753			printf("%s: bad xsum and drop m = 0x%llx.\n", __func__,
754			(uint64_t)VM_KERNEL_ADDRPERM(m));
755		m_freem(m);
756		return (NULL);
757	}
758
759	return (m);
760}
761
762/*
763 * When TCP detects a stable, steady flow without out of ordering,
764 * with a sufficiently high cwnd, it invokes LRO.
765 */
766int
767tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen)
768{
769	int hash;
770	int flow_id;
771	struct mbuf *eject_mb;
772	struct lro_flow *lf;
773
774	hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
775		tcp_hdr->th_sport, tcp_hdr->th_dport,
776		(TCP_LRO_FLOW_MAP - 1));
777
778
779	lck_mtx_lock_spin(&tcp_lro_lock);
780	flow_id = lro_flow_map[hash];
781	if (flow_id != TCP_LRO_FLOW_NOTFOUND) {
782		lf = &lro_flow_list[flow_id];
783		if ((lf->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
784		    (lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
785		    (lf->lr_fport == tcp_hdr->th_sport) &&
786		    (lf->lr_lport == tcp_hdr->th_dport)) {
787		    	if ((lf->lr_tcphdr == NULL) &&
788		    		(lf->lr_seq != (tcp_hdr->th_seq + tlen))) {
789				lf->lr_seq = tcp_hdr->th_seq + tlen;
790			}
791			lf->lr_flags &= ~LRO_EJECT_REQ;
792		}
793		lck_mtx_unlock(&tcp_lro_lock);
794		return 0;
795	}
796
797	HTONL(tcp_hdr->th_seq);
798	HTONL(tcp_hdr->th_ack);
799	eject_mb =
800		tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash,
801		NULL, NULL, NULL);
802
803	lck_mtx_unlock(&tcp_lro_lock);
804
805	NTOHL(tcp_hdr->th_seq);
806	NTOHL(tcp_hdr->th_ack);
807	if (lrodebug >= 3) {
808		printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n",
809			__func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
810			tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq);
811	}
812	ASSERT(eject_mb == NULL);
813	return 0;
814}
815
816/*
817 * When TCP detects loss or idle condition, it stops offloading
818 * to LRO.
819 */
820int
821tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr,
822		unsigned short sport, unsigned short dport)
823{
824	int hash, flow_id;
825	struct lro_flow *lf;
826
827	hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
828		(TCP_LRO_FLOW_MAP - 1));
829	lck_mtx_lock_spin(&tcp_lro_lock);
830	flow_id = lro_flow_map[hash];
831	if (flow_id == TCP_LRO_FLOW_UNINIT) {
832		lck_mtx_unlock(&tcp_lro_lock);
833		return 0;
834	}
835	lf = &lro_flow_list[flow_id];
836	if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
837	    (lf->lr_laddr.s_addr == saddr.s_addr) &&
838	    (lf->lr_fport == dport) &&
839	    (lf->lr_lport == sport)) {
840		if (lrodebug) {
841			printf("%s: %x %x\n", __func__,
842				lf->lr_flags, lf->lr_seq);
843		}
844		lf->lr_flags |= LRO_EJECT_REQ;
845	}
846	lck_mtx_unlock(&tcp_lro_lock);
847	return 0;
848}
849
850void
851tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr,
852		unsigned short sport, unsigned short dport)
853{
854	int hash, flow_id;
855	struct lro_flow *lf;
856
857	hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
858		(TCP_LRO_FLOW_MAP - 1));
859	lck_mtx_lock_spin(&tcp_lro_lock);
860	flow_id = lro_flow_map[hash];
861	if (flow_id == TCP_LRO_FLOW_UNINIT) {
862		lck_mtx_unlock(&tcp_lro_lock);
863		return;
864	}
865	lf = &lro_flow_list[flow_id];
866	if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
867	    (lf->lr_laddr.s_addr == saddr.s_addr) &&
868	    (lf->lr_fport == dport) &&
869	    (lf->lr_lport == sport) &&
870	    (lf->lr_tcphdr == NULL)) {
871		lf->lr_seq = (tcp_seq)rcv_nxt;
872	}
873	lck_mtx_unlock(&tcp_lro_lock);
874	return;
875}
876
877static void
878lro_update_stats(struct mbuf *m)
879{
880	switch(m->m_pkthdr.lro_npkts) {
881	case 0: /* fall through */
882	case 1:
883		break;
884
885	case 2:
886		tcpstat.tcps_lro_twopack++;
887		break;
888
889	case 3: /* fall through */
890	case 4:
891		tcpstat.tcps_lro_multpack++;
892		break;
893
894	default:
895		tcpstat.tcps_lro_largepack++;
896		break;
897	}
898	return;
899}
900
901static void
902lro_update_flush_stats(struct mbuf *m)
903{
904	lro_flushes++;
905	switch(m->m_pkthdr.lro_npkts) {
906	case 0: ASSERT(0);
907	case 1: lro_single_flushes++;
908		break;
909	case 2: lro_double_flushes++;
910		break;
911	default: lro_good_flushes++;
912		break;
913	}
914	return;
915}
916