tcp_offload.h revision 174704
1174704Skmacy/*- 2174704Skmacy * Copyright (c) 2007, Chelsio Inc. 3174704Skmacy * All rights reserved. 4174704Skmacy * 5174704Skmacy * Redistribution and use in source and binary forms, with or without 6174704Skmacy * modification, are permitted provided that the following conditions are met: 7174704Skmacy * 8174704Skmacy * 1. Redistributions of source code must retain the above copyright notice, 9174704Skmacy * this list of conditions and the following disclaimer. 10174704Skmacy * 11174704Skmacy * 2. Neither the name of the Chelsio Corporation nor the names of its 12174704Skmacy * contributors may be used to endorse or promote products derived from 13174704Skmacy * this software without specific prior written permission. 14174704Skmacy * 15174704Skmacy * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16174704Skmacy * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17174704Skmacy * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18174704Skmacy * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19174704Skmacy * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20174704Skmacy * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21174704Skmacy * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22174704Skmacy * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23174704Skmacy * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24174704Skmacy * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25174704Skmacy * POSSIBILITY OF SUCH DAMAGE. 
26174704Skmacy * 27174704Skmacy * $FreeBSD: head/sys/netinet/tcp_offload.h 174704 2007-12-17 07:56:27Z kmacy $ 28174704Skmacy */ 29174704Skmacy 30174704Skmacy#ifndef _NETINET_TCP_OFFLOAD_H_ 31174704Skmacy#define _NETINET_TCP_OFFLOAD_H_ 32174704Skmacy 33174704Skmacy#ifndef _KERNEL 34174704Skmacy#error "no user-serviceable parts inside" 35174704Skmacy#endif 36174704Skmacy 37174704Skmacy/* 38174704Skmacy * A driver publishes that it provides offload services 39174704Skmacy * by setting IFCAP_TOE in the ifnet. The offload connect 40174704Skmacy * will bypass any further work if the interface that a 41174704Skmacy * connection would use does not support TCP offload. 42174704Skmacy * 43174704Skmacy * The TOE API assumes that the tcp offload engine can offload 44174704Skmacy * the entire connection from set up to teardown, with some provision 45174704Skmacy * being made to allow the software stack to handle time wait. If 46174704Skmacy * the device does not meet these criteria, it is the driver's responsibility 47174704Skmacy * to overload the functions that it needs to in tcp_usrreqs and make 48174704Skmacy * its own calls to tcp_output if it needs to do so. 49174704Skmacy * 50174704Skmacy * There is currently no provision for the device advertising the congestion 51174704Skmacy * control algorithms it supports as there is currently no API for querying 52174704Skmacy * an operating system for the protocols that it has loaded. This is a desirable 53174704Skmacy * future extension. 54174704Skmacy * 55174704Skmacy * 56174704Skmacy * 57174704Skmacy * It is assumed that individuals deploying TOE will want connections 58174704Skmacy * to be offloaded without software changes so all connections on an 59174704Skmacy * interface providing TOE are offloaded unless the SO_NO_OFFLOAD 60174704Skmacy * flag is set on the socket. 
61174704Skmacy * 62174704Skmacy * 63174704Skmacy * The toe_usrreqs structure constitutes the TOE driver's 64174704Skmacy * interface to the TCP stack for functionality that doesn't 65174704Skmacy * interact directly with userspace. If one wants to provide 66174704Skmacy * (optional) functionality to do zero-copy to/from 67174704Skmacy * userspace one still needs to override soreceive/sosend 68174704Skmacy * with functions that fault in and pin the user buffers. 69174704Skmacy * 70174704Skmacy * + tu_send 71174704Skmacy * - tells the driver that new data may have been added to the 72174704Skmacy * socket's send buffer - the driver should not fail if the 73174704Skmacy * buffer is in fact unchanged 74174704Skmacy * - the driver is responsible for providing credits (bytes in the send window) 75174704Skmacy * back to the socket by calling sbdrop() as segments are acknowledged. 76174704Skmacy * - The driver expects the inpcb lock to be held - the driver is expected 77174704Skmacy * not to drop the lock. Hence the driver is not allowed to acquire the 78174704Skmacy * pcbinfo lock during this call. 79174704Skmacy * 80174704Skmacy * + tu_rcvd 81174704Skmacy * - returns credits to the driver and triggers window updates 82174704Skmacy * to the peer (a credit as used here is a byte in the peer's receive window) 83174704Skmacy * - the driver is expected to determine how many bytes have been 84174704Skmacy * consumed and credit that back to the card so that it can grow 85174704Skmacy * the window again by maintaining its own state between invocations. 86174704Skmacy * - In principle this could be used to shrink the window as well as 87174704Skmacy * grow the window, although it is not used for that now. 88174704Skmacy * - this function needs to correctly handle being called any number of 89174704Skmacy * times without any bytes being consumed from the receive buffer. 
90174704Skmacy * - The driver expects the inpcb lock to be held - the driver is expected 91174704Skmacy * not to drop the lock. Hence the driver is not allowed to acquire the 92174704Skmacy * pcbinfo lock during this call. 93174704Skmacy * 94174704Skmacy * + tu_disconnect 95174704Skmacy * - tells the driver to send FIN to peer 96174704Skmacy * - driver is expected to send the remaining data and then do a clean half close 97174704Skmacy * - disconnect implies at least half-close so only send, reset, and detach 98174704Skmacy * are legal 99174704Skmacy * - the driver is expected to handle transition through the shutdown 100174704Skmacy * state machine and allow the stack to support SO_LINGER. 101174704Skmacy * - The driver expects the inpcb lock to be held - the driver is expected 102174704Skmacy * not to drop the lock. Hence the driver is not allowed to acquire the 103174704Skmacy * pcbinfo lock during this call. 104174704Skmacy * 105174704Skmacy * + tu_reset 106174704Skmacy * - closes the connection and sends a RST to peer 107174704Skmacy * - driver is expected to trigger an RST and detach the toepcb 108174704Skmacy * - no further calls are legal after reset 109174704Skmacy * - The driver expects the inpcb lock to be held - the driver is expected 110174704Skmacy * not to drop the lock. Hence the driver is not allowed to acquire the 111174704Skmacy * pcbinfo lock during this call. 
112174704Skmacy * 113174704Skmacy * The following fields in the tcpcb are expected to be referenced by the driver: 114174704Skmacy * + iss 115174704Skmacy * + rcv_nxt 116174704Skmacy * + rcv_wnd 117174704Skmacy * + snd_isn 118174704Skmacy * + snd_max 119174704Skmacy * + snd_nxt 120174704Skmacy * + snd_una 121174704Skmacy * + t_flags 122174704Skmacy * + t_inpcb 123174704Skmacy * + t_maxseg 124174704Skmacy * + t_toe 125174704Skmacy * 126174704Skmacy * The following fields in the inpcb are expected to be referenced by the driver: 127174704Skmacy * + inp_lport 128174704Skmacy * + inp_fport 129174704Skmacy * + inp_laddr 130174704Skmacy * + inp_faddr 131174704Skmacy * + inp_socket 132174704Skmacy * + inp_ip_tos 133174704Skmacy * 134174704Skmacy * The following fields in the socket are expected to be referenced by the 135174704Skmacy * driver: 136174704Skmacy * + so_comp 137174704Skmacy * + so_error 138174704Skmacy * + so_linger 139174704Skmacy * + so_options 140174704Skmacy * + so_rcv 141174704Skmacy * + so_snd 142174704Skmacy * + so_state 143174704Skmacy * + so_timeo 144174704Skmacy * 145174704Skmacy * These functions all return 0 on success and can return the following errors 146174704Skmacy * as appropriate: 147174704Skmacy * + EPERM: 148174704Skmacy * + ENOBUFS: memory allocation failed 149174704Skmacy * + EMSGSIZE: MTU changed during the call 150174704Skmacy * + EHOSTDOWN: 151174704Skmacy * + EHOSTUNREACH: 152174704Skmacy * + ENETDOWN: 153174704Skmacy * + ENETUNREACH: the peer is no longer reachable 154174704Skmacy * 155174704Skmacy * + tu_detach 156174704Skmacy * - tells driver that the socket is going away so disconnect 157174704Skmacy * the toepcb and free appropriate resources 158174704Skmacy * - allows the driver to cleanly handle the case of connection state 159174704Skmacy * outliving the socket 160174704Skmacy * - no further calls are legal after detach 161174704Skmacy * - the driver is expected to provide its own synchronization between 162174704Skmacy * 
detach and receiving new data. 163174704Skmacy * 164174704Skmacy * + tu_syncache_event 165174704Skmacy * - even if it is not actually needed, the driver is expected to 166174704Skmacy * call syncache_add for the initial SYN and then syncache_expand 167174704Skmacy * for the SYN,ACK 168174704Skmacy * - tells driver that a connection either has not been added or has 169174704Skmacy * been dropped from the syncache 170174704Skmacy * - the driver is expected to maintain state that lives outside the 171174704Skmacy * software stack so the syncache needs to be able to notify the 172174704Skmacy * toe driver that the software stack is not going to create a connection 173174704Skmacy * for a received SYN 174174704Skmacy * - The driver is responsible for any synchronization required between 175174704Skmacy * the syncache dropping an entry and the driver processing the SYN,ACK. 176174704Skmacy * 177174704Skmacy */ 178174704Skmacystruct toe_usrreqs { 179174704Skmacy int (*tu_send)(struct tcpcb *tp); 180174704Skmacy int (*tu_rcvd)(struct tcpcb *tp); 181174704Skmacy int (*tu_disconnect)(struct tcpcb *tp); 182174704Skmacy int (*tu_reset)(struct tcpcb *tp); 183174704Skmacy void (*tu_detach)(struct tcpcb *tp); 184174704Skmacy void (*tu_syncache_event)(int event, void *toep); 185174704Skmacy}; 186174704Skmacy 187174704Skmacy#define TOE_SC_ENTRY_PRESENT 1 /* 4-tuple already present */ 188174704Skmacy#define TOE_SC_DROP 2 /* connection was timed out */ 189174704Skmacy 190174704Skmacy/* 191174704Skmacy * Because listen is a one-to-many relationship (a socket can be listening 192174704Skmacy * on all interfaces on a machine some of which may be using different TCP 193174704Skmacy * offload devices), listen uses a publish/subscribe mechanism. The TCP 194174704Skmacy * offload driver registers a listen notification function with the stack. 
195174704Skmacy * When a listen socket is created all TCP offload devices are notified 196174704Skmacy * so that they can do the appropriate set up to offload connections on the 197174704Skmacy * port to which the socket is bound. When the listen socket is closed, 198174704Skmacy * the offload devices are notified so that they will stop listening on that 199174704Skmacy * port and free any associated resources as well as sending RSTs on any 200174704Skmacy * connections in the SYN_RCVD state. 201174704Skmacy * 202174704Skmacy */ 203174704Skmacy 204174704Skmacytypedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *); 205174704Skmacytypedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *); 206174704Skmacy 207174704SkmacyEVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn); 208174704SkmacyEVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn); 209174704Skmacy 210174704Skmacy/* 211174704Skmacy * Check if the socket can be offloaded by the following steps: 212174704Skmacy * - determine the egress interface 213174704Skmacy * - check the interface for TOE capability and TOE is enabled 214174704Skmacy * - check if the device has resources to offload the connection 215174704Skmacy */ 216174704Skmacyint tcp_offload_connect(struct socket *so, struct sockaddr *nam); 217174704Skmacy 218174704Skmacy/* 219174704Skmacy * The tcp_gen_* routines are wrappers around the toe_usrreqs calls, 220174704Skmacy * in the non-offloaded case they translate to tcp_output. 221174704Skmacy * 222174704Skmacy * Listen is a special case because it is a 1 to many relationship 223174704Skmacy * and there can be more than one offload driver in the system. 
224174704Skmacy */ 225174704Skmacy 226174704Skmacy/* 227174704Skmacy * Connection is offloaded 228174704Skmacy */ 229174704Skmacy#define tp_offload(tp) ((tp)->t_flags & TF_TOE) 230174704Skmacy/* 231174704Skmacy * The socket has not been marked as "do not offload" 232174704Skmacy */ 233174704Skmacy#define SO_OFFLOADABLE(so) ((so->so_options & SO_NO_OFFLOAD) == 0) 234174704Skmacy 235174704Skmacystatic __inline int 236174704Skmacytcp_gen_connect(struct socket *so, struct sockaddr *nam) 237174704Skmacy{ 238174704Skmacy struct tcpcb *tp = sototcpcb(so); 239174704Skmacy int error; 240174704Skmacy 241174704Skmacy /* 242174704Skmacy * If offload has been disabled for this socket or the 243174704Skmacy * connection cannot be offloaded just call tcp_output 244174704Skmacy * to start the TCP state machine. 245174704Skmacy */ 246174704Skmacy#ifndef TCP_OFFLOAD_DISABLE 247174704Skmacy if (!SO_OFFLOADABLE(so) || (error = tcp_offload_connect(so, nam)) != 0) 248174704Skmacy#endif 249174704Skmacy error = tcp_output(tp); 250174704Skmacy return (error); 251174704Skmacy} 252174704Skmacy 253174704Skmacystatic __inline int 254174704Skmacytcp_gen_send(struct tcpcb *tp) 255174704Skmacy{ 256174704Skmacy 257174704Skmacy#ifndef TCP_OFFLOAD_DISABLE 258174704Skmacy if (tp_offload(tp)) 259174704Skmacy return (tp->t_tu->tu_send(tp)); 260174704Skmacy#endif 261174704Skmacy return (tcp_output(tp)); 262174704Skmacy} 263174704Skmacy 264174704Skmacystatic __inline int 265174704Skmacytcp_gen_rcvd(struct tcpcb *tp) 266174704Skmacy{ 267174704Skmacy 268174704Skmacy#ifndef TCP_OFFLOAD_DISABLE 269174704Skmacy if (tp_offload(tp)) 270174704Skmacy return (tp->t_tu->tu_rcvd(tp)); 271174704Skmacy#endif 272174704Skmacy return (tcp_output(tp)); 273174704Skmacy} 274174704Skmacy 275174704Skmacystatic __inline int 276174704Skmacytcp_gen_disconnect(struct tcpcb *tp) 277174704Skmacy{ 278174704Skmacy 279174704Skmacy#ifndef TCP_OFFLOAD_DISABLE 280174704Skmacy if (tp_offload(tp)) 281174704Skmacy return 
(tp->t_tu->tu_disconnect(tp)); 282174704Skmacy#endif 283174704Skmacy return (tcp_output(tp)); 284174704Skmacy} 285174704Skmacy 286174704Skmacystatic __inline int 287174704Skmacytcp_gen_reset(struct tcpcb *tp) 288174704Skmacy{ 289174704Skmacy 290174704Skmacy#ifndef TCP_OFFLOAD_DISABLE 291174704Skmacy if (tp_offload(tp)) 292174704Skmacy return (tp->t_tu->tu_reset(tp)); 293174704Skmacy#endif 294174704Skmacy return (tcp_output(tp)); 295174704Skmacy} 296174704Skmacy 297174704Skmacystatic __inline void 298174704Skmacytcp_gen_detach(struct tcpcb *tp) 299174704Skmacy{ 300174704Skmacy 301174704Skmacy#ifndef TCP_OFFLOAD_DISABLE 302174704Skmacy if (tp_offload(tp)) 303174704Skmacy tp->t_tu->tu_detach(tp); 304174704Skmacy#endif 305174704Skmacy} 306174704Skmacy 307174704Skmacystatic __inline void 308174704Skmacytcp_gen_listen_open(struct tcpcb *tp) 309174704Skmacy{ 310174704Skmacy 311174704Skmacy#ifndef TCP_OFFLOAD_DISABLE 312174704Skmacy if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket)) 313174704Skmacy EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp); 314174704Skmacy#endif 315174704Skmacy} 316174704Skmacy 317174704Skmacystatic __inline void 318174704Skmacytcp_gen_listen_close(struct tcpcb *tp) 319174704Skmacy{ 320174704Skmacy 321174704Skmacy#ifndef TCP_OFFLOAD_DISABLE 322174704Skmacy EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp); 323174704Skmacy#endif 324174704Skmacy} 325174704Skmacy 326174704Skmacy#undef tp_offload 327174704Skmacy#undef SO_OFFLOADABLE 328174704Skmacy#endif /* _NETINET_TCP_OFFLOAD_H_ */ 329