tcp_offload.h revision 174704
1174704Skmacy/*-
2174704Skmacy * Copyright (c) 2007, Chelsio Inc.
3174704Skmacy * All rights reserved.
4174704Skmacy *
5174704Skmacy * Redistribution and use in source and binary forms, with or without
6174704Skmacy * modification, are permitted provided that the following conditions are met:
7174704Skmacy *
8174704Skmacy * 1. Redistributions of source code must retain the above copyright notice,
9174704Skmacy *    this list of conditions and the following disclaimer.
10174704Skmacy *
11174704Skmacy * 2. Neither the name of the Chelsio Corporation nor the names of its
12174704Skmacy *    contributors may be used to endorse or promote products derived from
13174704Skmacy *    this software without specific prior written permission.
14174704Skmacy *
15174704Skmacy * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16174704Skmacy * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17174704Skmacy * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18174704Skmacy * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19174704Skmacy * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20174704Skmacy * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21174704Skmacy * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22174704Skmacy * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23174704Skmacy * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24174704Skmacy * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25174704Skmacy * POSSIBILITY OF SUCH DAMAGE.
26174704Skmacy *
27174704Skmacy * $FreeBSD: head/sys/netinet/tcp_offload.h 174704 2007-12-17 07:56:27Z kmacy $
28174704Skmacy */
29174704Skmacy
30174704Skmacy#ifndef _NETINET_TCP_OFFLOAD_H_
31174704Skmacy#define	_NETINET_TCP_OFFLOAD_H_
32174704Skmacy
33174704Skmacy#ifndef _KERNEL
34174704Skmacy#error "no user-serviceable parts inside"
35174704Skmacy#endif
36174704Skmacy
37174704Skmacy/*
38174704Skmacy * A driver publishes that it provides offload services
39174704Skmacy * by setting IFCAP_TOE in the ifnet. The offload connect
40174704Skmacy * will bypass any further work if the interface that a
41174704Skmacy * connection would use does not support TCP offload.
42174704Skmacy *
 * The TOE API assumes that the tcp offload engine can offload the
 * entire connection from set up to teardown, with some provision
 * being made to allow the software stack to handle time wait. If
 * the device does not meet these criteria, it is the driver's responsibility
 * to overload the functions that it needs to in tcp_usrreqs and make
 * its own calls to tcp_output if it needs to do so.
49174704Skmacy *
50174704Skmacy * There is currently no provision for the device advertising the congestion
51174704Skmacy * control algorithms it supports as there is currently no API for querying
52174704Skmacy * an operating system for the protocols that it has loaded. This is a desirable
53174704Skmacy * future extension.
54174704Skmacy *
55174704Skmacy *
56174704Skmacy *
 * It is assumed that individuals deploying TOE will want connections
 * to be offloaded without software changes so all connections on an
 * interface providing TOE are offloaded unless the SO_NO_OFFLOAD
 * flag is set on the socket.
61174704Skmacy *
62174704Skmacy *
63174704Skmacy * The toe_usrreqs structure constitutes the TOE driver's
64174704Skmacy * interface to the TCP stack for functionality that doesn't
65174704Skmacy * interact directly with userspace. If one wants to provide
66174704Skmacy * (optional) functionality to do zero-copy to/from
67174704Skmacy * userspace one still needs to override soreceive/sosend
68174704Skmacy * with functions that fault in and pin the user buffers.
69174704Skmacy *
70174704Skmacy * + tu_send
71174704Skmacy *   - tells the driver that new data may have been added to the
72174704Skmacy *     socket's send buffer - the driver should not fail if the
73174704Skmacy *     buffer is in fact unchanged
74174704Skmacy *   - the driver is responsible for providing credits (bytes in the send window)
75174704Skmacy *     back to the socket by calling sbdrop() as segments are acknowledged.
76174704Skmacy *   - The driver expects the inpcb lock to be held - the driver is expected
77174704Skmacy *     not to drop the lock. Hence the driver is not allowed to acquire the
78174704Skmacy *     pcbinfo lock during this call.
79174704Skmacy *
80174704Skmacy * + tu_rcvd
81174704Skmacy *   - returns credits to the driver and triggers window updates
82174704Skmacy *     to the peer (a credit as used here is a byte in the peer's receive window)
83174704Skmacy *   - the driver is expected to determine how many bytes have been
84174704Skmacy *     consumed and credit that back to the card so that it can grow
85174704Skmacy *     the window again by maintaining its own state between invocations.
86174704Skmacy *   - In principle this could be used to shrink the window as well as
87174704Skmacy *     grow the window, although it is not used for that now.
88174704Skmacy *   - this function needs to correctly handle being called any number of
89174704Skmacy *     times without any bytes being consumed from the receive buffer.
90174704Skmacy *   - The driver expects the inpcb lock to be held - the driver is expected
91174704Skmacy *     not to drop the lock. Hence the driver is not allowed to acquire the
92174704Skmacy *     pcbinfo lock during this call.
93174704Skmacy *
94174704Skmacy * + tu_disconnect
95174704Skmacy *   - tells the driver to send FIN to peer
96174704Skmacy *   - driver is expected to send the remaining data and then do a clean half close
97174704Skmacy *   - disconnect implies at least half-close so only send, reset, and detach
98174704Skmacy *     are legal
99174704Skmacy *   - the driver is expected to handle transition through the shutdown
100174704Skmacy *     state machine and allow the stack to support SO_LINGER.
101174704Skmacy *   - The driver expects the inpcb lock to be held - the driver is expected
102174704Skmacy *     not to drop the lock. Hence the driver is not allowed to acquire the
103174704Skmacy *     pcbinfo lock during this call.
104174704Skmacy *
 * + tu_reset
 *   - closes the connection and sends a RST to peer
 *   - driver is expected to trigger an RST and detach the toepcb
 *   - no further calls are legal after reset
 *   - The driver expects the inpcb lock to be held - the driver is expected
 *     not to drop the lock. Hence the driver is not allowed to acquire the
 *     pcbinfo lock during this call.
112174704Skmacy *
113174704Skmacy *   The following fields in the tcpcb are expected to be referenced by the driver:
114174704Skmacy *	+ iss
115174704Skmacy *	+ rcv_nxt
116174704Skmacy *	+ rcv_wnd
117174704Skmacy *	+ snd_isn
118174704Skmacy *	+ snd_max
119174704Skmacy *	+ snd_nxt
120174704Skmacy *	+ snd_una
121174704Skmacy *	+ t_flags
122174704Skmacy *	+ t_inpcb
123174704Skmacy *	+ t_maxseg
124174704Skmacy *	+ t_toe
125174704Skmacy *
 *   The following fields in the inpcb are expected to be referenced by the driver:
 *	+ inp_lport
 *	+ inp_fport
 *	+ inp_laddr
 *	+ inp_faddr
 *	+ inp_socket
 *	+ inp_ip_tos
133174704Skmacy *
134174704Skmacy *   The following fields in the socket are expected to be referenced by the
135174704Skmacy *   driver:
136174704Skmacy *	+ so_comp
137174704Skmacy *	+ so_error
138174704Skmacy *	+ so_linger
139174704Skmacy *	+ so_options
140174704Skmacy *	+ so_rcv
141174704Skmacy *	+ so_snd
142174704Skmacy *	+ so_state
143174704Skmacy *	+ so_timeo
144174704Skmacy *
 *   These functions all return 0 on success and can return the following errors
 *   as appropriate:
 *	+ EPERM:
 *	+ ENOBUFS: memory allocation failed
 *	+ EMSGSIZE: MTU changed during the call
 *	+ EHOSTDOWN:
 *	+ EHOSTUNREACH:
 *	+ ENETDOWN:
 *	+ ENETUNREACH: the peer is no longer reachable
154174704Skmacy *
155174704Skmacy * + tu_detach
156174704Skmacy *   - tells driver that the socket is going away so disconnect
157174704Skmacy *     the toepcb and free appropriate resources
158174704Skmacy *   - allows the driver to cleanly handle the case of connection state
159174704Skmacy *     outliving the socket
160174704Skmacy *   - no further calls are legal after detach
161174704Skmacy *   - the driver is expected to provide its own synchronization between
162174704Skmacy *     detach and receiving new data.
163174704Skmacy *
164174704Skmacy * + tu_syncache_event
165174704Skmacy *   - even if it is not actually needed, the driver is expected to
166174704Skmacy *     call syncache_add for the initial SYN and then syncache_expand
167174704Skmacy *     for the SYN,ACK
168174704Skmacy *   - tells driver that a connection either has not been added or has
169174704Skmacy *     been dropped from the syncache
170174704Skmacy *   - the driver is expected to maintain state that lives outside the
171174704Skmacy *     software stack so the syncache needs to be able to notify the
172174704Skmacy *     toe driver that the software stack is not going to create a connection
173174704Skmacy *     for a received SYN
174174704Skmacy *   - The driver is responsible for any synchronization required between
175174704Skmacy *     the syncache dropping an entry and the driver processing the SYN,ACK.
176174704Skmacy *
177174704Skmacy */
/*
 * Table of driver entry points that the tcp_gen_* wrappers call for an
 * offloaded connection.  Member order is part of the layout drivers
 * initialize, so it must not change.
 */
struct toe_usrreqs {
	/* New data may have been added to the socket's send buffer. */
	int (*tu_send)(struct tcpcb *tp);
	/* Return receive-window credits to the driver. */
	int (*tu_rcvd)(struct tcpcb *tp);
	/* Send the remaining data, then FIN (clean half close). */
	int (*tu_disconnect)(struct tcpcb *tp);
	/* Close the connection and send a RST to the peer. */
	int (*tu_reset)(struct tcpcb *tp);
	/* The socket is going away; disconnect the toepcb and free resources. */
	void (*tu_detach)(struct tcpcb *tp);
	/*
	 * Syncache notification.  NOTE(review): `event' is presumably one
	 * of the TOE_SC_* codes and `toep' the driver's own per-connection
	 * state - confirm against the syncache caller.
	 */
	void (*tu_syncache_event)(int event, void *toep);
};
186174704Skmacy
/*
 * Syncache event codes.  NOTE(review): presumably the values passed as
 * the `event' argument of tu_syncache_event - confirm against the caller.
 */
#define	TOE_SC_ENTRY_PRESENT		1	/* 4-tuple already present */
#define	TOE_SC_DROP			2	/* connection was timed out */
189174704Skmacy
190174704Skmacy/*
191174704Skmacy * Because listen is a one-to-many relationship (a socket can be listening
192174704Skmacy * on all interfaces on a machine some of which may be using different TCP
193174704Skmacy * offload devices), listen uses a publish/subscribe mechanism. The TCP
194174704Skmacy * offload driver registers a listen notification function with the stack.
195174704Skmacy * When a listen socket is created all TCP offload devices are notified
196174704Skmacy * so that they can do the appropriate set up to offload connections on the
197174704Skmacy * port to which the socket is bound. When the listen socket is closed,
198174704Skmacy * the offload devices are notified so that they will stop listening on that
199174704Skmacy * port and free any associated resources as well as sending RSTs on any
200174704Skmacy * connections in the SYN_RCVD state.
201174704Skmacy *
202174704Skmacy */
203174704Skmacy
204174704Skmacytypedef	void	(*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
205174704Skmacytypedef	void	(*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
206174704Skmacy
207174704SkmacyEVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
208174704SkmacyEVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
209174704Skmacy
/*
 * Check if the socket can be offloaded by the following steps:
 * - determine the egress interface
 * - check that the interface is TOE capable and that TOE is enabled
 * - check that the device has resources to offload the connection
 *
 * Returns 0 on success; on any non-zero return tcp_gen_connect() falls
 * back to tcp_output().
 */
int	tcp_offload_connect(struct socket *so, struct sockaddr *nam);
217174704Skmacy
/*
 * The tcp_gen_* routines are wrappers around the toe_usrreqs calls;
 * in the non-offloaded case they translate to tcp_output.
 *
 * Listen is a special case because it is a 1 to many relationship
 * and there can be more than one offload driver in the system.
 */
225174704Skmacy
/*
 * Connection is offloaded: evaluates to non-zero when TF_TOE is set in
 * the tcpcb's t_flags.
 */
#define	tp_offload(tp)		((tp)->t_flags & TF_TOE)
/*
 * The socket has not been marked as "do not offload": true when
 * SO_NO_OFFLOAD is clear in so_options.  The argument is parenthesized
 * so that any pointer expression (e.g. &sock) can be passed safely.
 */
#define	SO_OFFLOADABLE(so)	(((so)->so_options & SO_NO_OFFLOAD) == 0)
234174704Skmacy
/*
 * Start connection establishment for `so' towards `nam'.
 *
 * Unless TCP_OFFLOAD_DISABLE is defined, a socket that has not opted out
 * of offload is first handed to tcp_offload_connect(); if that succeeds,
 * 0 is returned.  In every other case (offload disabled for the socket,
 * offload attempt failed, or offload compiled out) tcp_output() is called
 * to start the TCP state machine and its result is returned.
 */
static __inline int
tcp_gen_connect(struct socket *so, struct sockaddr *nam)
{
	struct tcpcb *tp = sototcpcb(so);

#ifndef TCP_OFFLOAD_DISABLE
	/* Offload succeeded, so tcp_output() is skipped. */
	if (SO_OFFLOADABLE(so) && tcp_offload_connect(so, nam) == 0)
		return (0);
#endif
	return (tcp_output(tp));
}
252174704Skmacy
253174704Skmacystatic __inline int
254174704Skmacytcp_gen_send(struct tcpcb *tp)
255174704Skmacy{
256174704Skmacy
257174704Skmacy#ifndef TCP_OFFLOAD_DISABLE
258174704Skmacy	if (tp_offload(tp))
259174704Skmacy		return (tp->t_tu->tu_send(tp));
260174704Skmacy#endif
261174704Skmacy	return (tcp_output(tp));
262174704Skmacy}
263174704Skmacy
264174704Skmacystatic __inline int
265174704Skmacytcp_gen_rcvd(struct tcpcb *tp)
266174704Skmacy{
267174704Skmacy
268174704Skmacy#ifndef TCP_OFFLOAD_DISABLE
269174704Skmacy	if (tp_offload(tp))
270174704Skmacy		return (tp->t_tu->tu_rcvd(tp));
271174704Skmacy#endif
272174704Skmacy	return (tcp_output(tp));
273174704Skmacy}
274174704Skmacy
275174704Skmacystatic __inline int
276174704Skmacytcp_gen_disconnect(struct tcpcb *tp)
277174704Skmacy{
278174704Skmacy
279174704Skmacy#ifndef TCP_OFFLOAD_DISABLE
280174704Skmacy	if (tp_offload(tp))
281174704Skmacy		return (tp->t_tu->tu_disconnect(tp));
282174704Skmacy#endif
283174704Skmacy	return (tcp_output(tp));
284174704Skmacy}
285174704Skmacy
286174704Skmacystatic __inline int
287174704Skmacytcp_gen_reset(struct tcpcb *tp)
288174704Skmacy{
289174704Skmacy
290174704Skmacy#ifndef TCP_OFFLOAD_DISABLE
291174704Skmacy	if (tp_offload(tp))
292174704Skmacy		return (tp->t_tu->tu_reset(tp));
293174704Skmacy#endif
294174704Skmacy	return (tcp_output(tp));
295174704Skmacy}
296174704Skmacy
297174704Skmacystatic __inline void
298174704Skmacytcp_gen_detach(struct tcpcb *tp)
299174704Skmacy{
300174704Skmacy
301174704Skmacy#ifndef TCP_OFFLOAD_DISABLE
302174704Skmacy	if (tp_offload(tp))
303174704Skmacy		tp->t_tu->tu_detach(tp);
304174704Skmacy#endif
305174704Skmacy}
306174704Skmacy
307174704Skmacystatic __inline void
308174704Skmacytcp_gen_listen_open(struct tcpcb *tp)
309174704Skmacy{
310174704Skmacy
311174704Skmacy#ifndef TCP_OFFLOAD_DISABLE
312174704Skmacy	if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket))
313174704Skmacy		EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
314174704Skmacy#endif
315174704Skmacy}
316174704Skmacy
317174704Skmacystatic __inline void
318174704Skmacytcp_gen_listen_close(struct tcpcb *tp)
319174704Skmacy{
320174704Skmacy
321174704Skmacy#ifndef TCP_OFFLOAD_DISABLE
322174704Skmacy	EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
323174704Skmacy#endif
324174704Skmacy}
325174704Skmacy
326174704Skmacy#undef tp_offload
327174704Skmacy#undef SO_OFFLOADABLE
328174704Skmacy#endif /* _NETINET_TCP_OFFLOAD_H_ */
329