tcp_offload.h revision 174704
1/*-
2 * Copyright (c) 2007, Chelsio Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 *    this list of conditions and the following disclaimer.
10 *
11 * 2. Neither the name of the Chelsio Corporation nor the names of its
12 *    contributors may be used to endorse or promote products derived from
13 *    this software without specific prior written permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 * POSSIBILITY OF SUCH DAMAGE.
26 *
27 * $FreeBSD: head/sys/netinet/tcp_offload.h 174704 2007-12-17 07:56:27Z kmacy $
28 */
29
30#ifndef _NETINET_TCP_OFFLOAD_H_
31#define	_NETINET_TCP_OFFLOAD_H_
32
33#ifndef _KERNEL
34#error "no user-serviceable parts inside"
35#endif
36
37/*
38 * A driver publishes that it provides offload services
39 * by setting IFCAP_TOE in the ifnet. The offload connect
40 * will bypass any further work if the interface that a
41 * connection would use does not support TCP offload.
42 *
43 * The TOE API assumes that the tcp offload engine can offload the
44 * entire connection from set up to teardown, with some provision
45 * being made to allow the software stack to handle time wait. If
46 * the device does not meet these criteria, it is the driver's responsibility
47 * to overload the functions that it needs to in tcp_usrreqs and make
48 * its own calls to tcp_output if it needs to do so.
49 *
50 * There is currently no provision for the device advertising the congestion
51 * control algorithms it supports as there is currently no API for querying
52 * an operating system for the protocols that it has loaded. This is a desirable
53 * future extension.
54 *
55 *
56 *
57 * It is assumed that individuals deploying TOE will want connections
58 * to be offloaded without software changes so all connections on an
59 * interface providing TOE are offloaded unless the SO_NO_OFFLOAD
60 * flag is set on the socket.
61 *
62 *
63 * The toe_usrreqs structure constitutes the TOE driver's
64 * interface to the TCP stack for functionality that doesn't
65 * interact directly with userspace. If one wants to provide
66 * (optional) functionality to do zero-copy to/from
67 * userspace one still needs to override soreceive/sosend
68 * with functions that fault in and pin the user buffers.
69 *
70 * + tu_send
71 *   - tells the driver that new data may have been added to the
72 *     socket's send buffer - the driver should not fail if the
73 *     buffer is in fact unchanged
74 *   - the driver is responsible for providing credits (bytes in the send window)
75 *     back to the socket by calling sbdrop() as segments are acknowledged.
76 *   - The driver expects the inpcb lock to be held - the driver is expected
77 *     not to drop the lock. Hence the driver is not allowed to acquire the
78 *     pcbinfo lock during this call.
79 *
80 * + tu_rcvd
81 *   - returns credits to the driver and triggers window updates
82 *     to the peer (a credit as used here is a byte in the peer's receive window)
83 *   - the driver is expected to determine how many bytes have been
84 *     consumed and credit that back to the card so that it can grow
85 *     the window again by maintaining its own state between invocations.
86 *   - In principle this could be used to shrink the window as well as
87 *     grow the window, although it is not used for that now.
88 *   - this function needs to correctly handle being called any number of
89 *     times without any bytes being consumed from the receive buffer.
90 *   - The driver expects the inpcb lock to be held - the driver is expected
91 *     not to drop the lock. Hence the driver is not allowed to acquire the
92 *     pcbinfo lock during this call.
93 *
94 * + tu_disconnect
95 *   - tells the driver to send FIN to peer
96 *   - driver is expected to send the remaining data and then do a clean half close
97 *   - disconnect implies at least half-close so only send, reset, and detach
98 *     are legal
99 *   - the driver is expected to handle transition through the shutdown
100 *     state machine and allow the stack to support SO_LINGER.
101 *   - The driver expects the inpcb lock to be held - the driver is expected
102 *     not to drop the lock. Hence the driver is not allowed to acquire the
103 *     pcbinfo lock during this call.
104 *
105 * + tu_reset
106 *   - closes the connection and sends a RST to peer
107 *   - driver is expected to trigger an RST and detach the toepcb
108 *   - no further calls are legal after reset
109 *   - The driver expects the inpcb lock to be held - the driver is expected
110 *     not to drop the lock. Hence the driver is not allowed to acquire the
111 *     pcbinfo lock during this call.
112 *
113 *   The following fields in the tcpcb are expected to be referenced by the driver:
114 *	+ iss
115 *	+ rcv_nxt
116 *	+ rcv_wnd
117 *	+ snd_isn
118 *	+ snd_max
119 *	+ snd_nxt
120 *	+ snd_una
121 *	+ t_flags
122 *	+ t_inpcb
123 *	+ t_maxseg
124 *	+ t_toe
125 *
126 *   The following fields in the inpcb are expected to be referenced by the driver:
127 *	+ inp_lport
128 *	+ inp_fport
129 *	+ inp_laddr
130 *	+ inp_faddr
131 *	+ inp_socket
132 *	+ inp_ip_tos
133 *
134 *   The following fields in the socket are expected to be referenced by the
135 *   driver:
136 *	+ so_comp
137 *	+ so_error
138 *	+ so_linger
139 *	+ so_options
140 *	+ so_rcv
141 *	+ so_snd
142 *	+ so_state
143 *	+ so_timeo
144 *
145 *   These functions all return 0 on success and can return the following errors
146 *   as appropriate:
147 *	+ EPERM:
148 *	+ ENOBUFS: memory allocation failed
149 *	+ EMSGSIZE: MTU changed during the call
150 *	+ EHOSTDOWN:
151 *	+ EHOSTUNREACH:
152 *	+ ENETDOWN:
153 *	+ ENETUNREACH: the peer is no longer reachable
154 *
155 * + tu_detach
156 *   - tells driver that the socket is going away so disconnect
157 *     the toepcb and free appropriate resources
158 *   - allows the driver to cleanly handle the case of connection state
159 *     outliving the socket
160 *   - no further calls are legal after detach
161 *   - the driver is expected to provide its own synchronization between
162 *     detach and receiving new data.
163 *
164 * + tu_syncache_event
165 *   - even if it is not actually needed, the driver is expected to
166 *     call syncache_add for the initial SYN and then syncache_expand
167 *     for the SYN,ACK
168 *   - tells driver that a connection either has not been added or has
169 *     been dropped from the syncache
170 *   - the driver is expected to maintain state that lives outside the
171 *     software stack so the syncache needs to be able to notify the
172 *     toe driver that the software stack is not going to create a connection
173 *     for a received SYN
174 *   - The driver is responsible for any synchronization required between
175 *     the syncache dropping an entry and the driver processing the SYN,ACK.
176 *
177 */
178struct toe_usrreqs {
179	int (*tu_send)(struct tcpcb *tp);	/* send buffer may hold new data */
180	int (*tu_rcvd)(struct tcpcb *tp);	/* return receive credits, update window */
181	int (*tu_disconnect)(struct tcpcb *tp);	/* send remaining data, then FIN */
182	int (*tu_reset)(struct tcpcb *tp);	/* send RST and detach the toepcb */
183	void (*tu_detach)(struct tcpcb *tp);	/* socket is going away */
184	void (*tu_syncache_event)(int event, void *toep); /* syncache add failed or entry dropped */
185};
186
/*
 * NOTE(review): presumably the values passed as the "event" argument of
 * tu_syncache_event - confirm against the syncache code.
 */
187#define	TOE_SC_ENTRY_PRESENT		1	/* 4-tuple already present */
188#define	TOE_SC_DROP			2	/* connection was timed out */
189
190/*
191 * Because listen is a one-to-many relationship (a socket can be listening
192 * on all interfaces on a machine some of which may be using different TCP
193 * offload devices), listen uses a publish/subscribe mechanism. The TCP
194 * offload driver registers a listen notification function with the stack.
195 * When a listen socket is created all TCP offload devices are notified
196 * so that they can do the appropriate set up to offload connections on the
197 * port to which the socket is bound. When the listen socket is closed,
198 * the offload devices are notified so that they will stop listening on that
199 * port and free any associated resources as well as sending RSTs on any
200 * connections in the SYN_RCVD state.
201 *
202 */
203
/*
 * Handler signatures and event declarations for listen start/stop.
 * The struct tcpcb argument is the tcpcb handed to EVENTHANDLER_INVOKE by
 * tcp_gen_listen_open() and tcp_gen_listen_close().  The leading void * is
 * presumably the argument supplied when the handler was registered (see
 * eventhandler(9)) - confirm.
 */
204typedef	void	(*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
205typedef	void	(*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
206
207EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
208EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
209
210/*
211 * Check if the socket can be offloaded by the following steps:
212 * - determine the egress interface
213 * - check that the interface is TOE capable and that TOE is enabled
214 * - check if the device has resources to offload the connection
 *
 * tcp_gen_connect() takes a return value of 0 to mean that the offload
 * path has taken over the connection; any non-zero return makes it fall
 * back to tcp_output().
215 */
216int	tcp_offload_connect(struct socket *so, struct sockaddr *nam);
217
218/*
219 * The tcp_gen_* routines are wrappers around the toe_usrreqs calls,
220 * in the non-offloaded case they translate to tcp_output.
221 *
222 * Listen is a special case because it is a 1 to many relationship
223 * and there can be more than one offload driver in the system.
224 */
225
/*
 * Connection is offloaded: evaluates to non-zero when TF_TOE is set in
 * the tcpcb's t_flags.
 */
#define	tp_offload(tp)		((tp)->t_flags & TF_TOE)
/*
 * The socket has not been marked as "do not offload": evaluates to 1 when
 * SO_NO_OFFLOAD is clear in so_options and to 0 otherwise.  The argument is
 * parenthesized so that any pointer expression (e.g. "p + 1" or "&s"), not
 * only a plain identifier, expands correctly.
 */
#define	SO_OFFLOADABLE(so)	(((so)->so_options & SO_NO_OFFLOAD) == 0)
234
/*
 * Start a connection, preferring the offload path.
 *
 * Returns 0 as soon as tcp_offload_connect() accepts the connection.
 * If the socket is marked SO_NO_OFFLOAD, if the offload attempt fails,
 * or if offload support is compiled out (TCP_OFFLOAD_DISABLE), the
 * result of tcp_output() is returned instead.
 */
static __inline int
tcp_gen_connect(struct socket *so, struct sockaddr *nam)
{
	struct tcpcb *tp;

	tp = sototcpcb(so);
#ifndef TCP_OFFLOAD_DISABLE
	/* An accepted offload means the software stack has nothing to do. */
	if (SO_OFFLOADABLE(so) && tcp_offload_connect(so, nam) == 0)
		return (0);
#endif
	/* Not offloaded: let tcp_output start the TCP state machine. */
	return (tcp_output(tp));
}
252
253static __inline int
254tcp_gen_send(struct tcpcb *tp)
255{
256
257#ifndef TCP_OFFLOAD_DISABLE
258	if (tp_offload(tp))
259		return (tp->t_tu->tu_send(tp));
260#endif
261	return (tcp_output(tp));
262}
263
264static __inline int
265tcp_gen_rcvd(struct tcpcb *tp)
266{
267
268#ifndef TCP_OFFLOAD_DISABLE
269	if (tp_offload(tp))
270		return (tp->t_tu->tu_rcvd(tp));
271#endif
272	return (tcp_output(tp));
273}
274
275static __inline int
276tcp_gen_disconnect(struct tcpcb *tp)
277{
278
279#ifndef TCP_OFFLOAD_DISABLE
280	if (tp_offload(tp))
281		return (tp->t_tu->tu_disconnect(tp));
282#endif
283	return (tcp_output(tp));
284}
285
286static __inline int
287tcp_gen_reset(struct tcpcb *tp)
288{
289
290#ifndef TCP_OFFLOAD_DISABLE
291	if (tp_offload(tp))
292		return (tp->t_tu->tu_reset(tp));
293#endif
294	return (tcp_output(tp));
295}
296
297static __inline void
298tcp_gen_detach(struct tcpcb *tp)
299{
300
301#ifndef TCP_OFFLOAD_DISABLE
302	if (tp_offload(tp))
303		tp->t_tu->tu_detach(tp);
304#endif
305}
306
307static __inline void
308tcp_gen_listen_open(struct tcpcb *tp)
309{
310
311#ifndef TCP_OFFLOAD_DISABLE
312	if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket))
313		EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
314#endif
315}
316
317static __inline void
318tcp_gen_listen_close(struct tcpcb *tp)
319{
320
321#ifndef TCP_OFFLOAD_DISABLE
322	EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
323#endif
324}
325
326#undef tp_offload
327#undef SO_OFFLOADABLE
328#endif /* _NETINET_TCP_OFFLOAD_H_ */
329