/* kern_poll.c revision 87902 */
1/*- 2 * Copyright (c) 2001 Luigi Rizzo 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * 25 * $FreeBSD: head/sys/kern/kern_poll.c 87902 2001-12-14 17:56:12Z luigi $ 26 */ 27 28#include <sys/param.h> 29#include <sys/systm.h> 30#include <sys/kernel.h> 31#include <sys/socket.h> /* needed by net/if.h */ 32#include <sys/sysctl.h> 33 34#include <net/if.h> /* for IFF_* flags */ 35#include <net/netisr.h> /* for NETISR_POLL */ 36 37#ifdef SMP 38#error DEVICE_POLLING is not compatible with SMP 39#endif 40 41void ether_poll1(void); 42void ether_poll(int); /* polling while in trap */ 43void ether_pollmore(void); 44void hardclock_device_poll(void); 45 46/* 47 * Polling support for [network] device drivers. 
48 * 49 * Drivers which support this feature try to register with the 50 * polling code. 51 * 52 * If registration is successful, the driver must disable interrupts, 53 * and further I/O is performed through the handler, which is invoked 54 * (at least once per clock tick) with 3 arguments: the "arg" passed at 55 * register time (a struct ifnet pointer), a command, and a "count" limit. 56 * 57 * The command can be one of the following: 58 * POLL_ONLY: quick move of "count" packets from input/output queues. 59 * POLL_AND_CHECK_STATUS: as above, plus check status registers or do 60 * other more expensive operations. This command is issued periodically 61 * but less frequently than POLL_ONLY. 62 * POLL_DEREGISTER: deregister and return to interrupt mode. 63 * 64 * The first two commands are only issued if the interface is marked as 65 * 'IFF_UP and IFF_RUNNING', the last one only if IFF_RUNNING is set. 66 * 67 * The count limit specifies how much work the handler can do during the 68 * call -- typically this is the number of packets to be received, or 69 * transmitted, etc. (drivers are free to interpret this number, as long 70 * as the max time spent in the function grows roughly linearly with the 71 * count). 72 * 73 * Deregistration can be requested by the driver itself (typically in the 74 * *_stop() routine), or by the polling code, by invoking the handler. 75 * 76 * Polling can be globally enabled or disabled with the sysctl variable 77 * kern.polling.enable (default is 0, disabled) 78 * 79 * A second variable controls the sharing of CPU between polling/kernel 80 * network processing, and other activities (typically userlevel tasks): 81 * kern.polling.user_frac (between 0 and 100, default 50) sets the share 82 * of CPU allocated to user tasks. CPU is allocated proportionally to the 83 * shares, by dynamically adjusting the "count" (poll_burst). 84 * 85 * Other parameters can should be left to their default values. 
86 * The following constraints hold 87 * 88 * 1 <= poll_each_burst <= poll_burst <= poll_burst_max 89 * 0 <= poll_in_trap <= poll_each_burst 90 * MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX 91 */ 92 93#define MIN_POLL_BURST_MAX 10 94#define MAX_POLL_BURST_MAX 1000 95 96SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0, 97 "Device polling parameters"); 98 99static u_int32_t poll_burst = 5; 100SYSCTL_ULONG(_kern_polling, OID_AUTO, burst, CTLFLAG_RW, 101 &poll_burst, 0, "Current polling burst size"); 102 103static u_int32_t poll_each_burst = 5; 104SYSCTL_ULONG(_kern_polling, OID_AUTO, each_burst, CTLFLAG_RW, 105 &poll_each_burst, 0, "Max size of each burst"); 106 107static u_int32_t poll_burst_max = 150; /* good for 100Mbit net and HZ=1000 */ 108SYSCTL_ULONG(_kern_polling, OID_AUTO, burst_max, CTLFLAG_RW, 109 &poll_burst_max, 0, "Max Polling burst size"); 110 111u_int32_t poll_in_trap; /* used in trap.c */ 112SYSCTL_ULONG(_kern_polling, OID_AUTO, poll_in_trap, CTLFLAG_RW, 113 &poll_in_trap, 0, "Poll burst size during a trap"); 114 115static u_int32_t user_frac = 50; 116SYSCTL_ULONG(_kern_polling, OID_AUTO, user_frac, CTLFLAG_RW, 117 &user_frac, 0, "Desired user fraction of cpu time"); 118 119static u_int32_t reg_frac = 20 ; 120SYSCTL_ULONG(_kern_polling, OID_AUTO, reg_frac, CTLFLAG_RW, 121 ®_frac, 0, "Every this many cycles poll register"); 122 123static u_int32_t short_ticks; 124SYSCTL_ULONG(_kern_polling, OID_AUTO, short_ticks, CTLFLAG_RW, 125 &short_ticks, 0, "Hardclock ticks shorter than they should be"); 126 127static u_int32_t lost_polls; 128SYSCTL_ULONG(_kern_polling, OID_AUTO, lost_polls, CTLFLAG_RW, 129 &lost_polls, 0, "How many times we would have lost a poll tick"); 130 131static u_int32_t poll_handlers; /* next free entry in pr[]. 
*/ 132SYSCTL_ULONG(_kern_polling, OID_AUTO, handlers, CTLFLAG_RD, 133 &poll_handlers, 0, "Number of registered poll handlers"); 134 135static int polling = 0; /* global polling enable */ 136SYSCTL_ULONG(_kern_polling, OID_AUTO, enable, CTLFLAG_RW, 137 &polling, 0, "Polling enabled"); 138 139 140static u_int32_t poll1_active; 141static u_int32_t need_poll_again; 142 143#define POLL_LIST_LEN 128 144struct pollrec { 145 poll_handler_t *handler; 146 struct ifnet *ifp; 147}; 148 149static struct pollrec pr[POLL_LIST_LEN]; 150 151/* 152 * Hook from hardclock. Tries to schedule a netisr, but keeps track 153 * of lost ticks due to the previous handler taking too long. 154 * The first part of the code is just for debugging purposes, and tries 155 * to count how often hardclock ticks are shorter than they should, 156 * meaning either stray interrupts or delayed events. 157 */ 158void 159hardclock_device_poll(void) 160{ 161 static struct timeval prev_t, t; 162 int delta; 163 164 microuptime(&t); 165 delta = (t.tv_usec - prev_t.tv_usec) + 166 (t.tv_sec - prev_t.tv_sec)*1000000; 167 if (delta * hz < 500000) 168 short_ticks++; 169 else 170 prev_t = t; 171 172 if (poll_handlers > 0) { 173 if (poll1_active) { 174 lost_polls++; 175 need_poll_again++; 176 } else { 177 poll1_active = 1; 178 schednetisr(NETISR_POLL); 179 } 180 } 181} 182 183/* 184 * ether_poll is called from the idle loop or from the trap handler. 185 */ 186void 187ether_poll(int count) 188{ 189 int i; 190 int s = splimp(); 191 192 mtx_lock(&Giant); 193 194 if (count > poll_each_burst) 195 count = poll_each_burst; 196 for (i = 0 ; i < poll_handlers ; i++) 197 if (pr[i].handler && (IFF_UP|IFF_RUNNING) == 198 (pr[i].ifp->if_flags & (IFF_UP|IFF_RUNNING)) ) 199 pr[i].handler(pr[i].ifp, 0, count); /* quick check */ 200 mtx_unlock(&Giant); 201 splx(s); 202} 203 204/* 205 * ether_pollmore is called after other netisr's, possibly scheduling 206 * another NETISR_POLL call, or adapting the burst size for the next cycle. 
207 * 208 * It is very bad to fetch large bursts of packets from a single card at once, 209 * because the burst could take a long time to be completely processed, or 210 * could saturate the intermediate queue (ipintrq or similar) leading to 211 * losses or unfairness. To reduce the problem, and also to account better for 212 * time spent in network-related processnig, we split the burst in smaller 213 * chunks of fixed size, giving control to the other netisr's between chunks. 214 * This helps in improving the fairness, reducing livelock (because we 215 * emulate more closely the "process to completion" that we have with 216 * fastforwarding) and accounting for the work performed in low level 217 * handling and forwarding. 218 */ 219 220static int residual_burst = 0; 221 222static struct timeval poll_start_t; 223 224void 225ether_pollmore() 226{ 227 struct timeval t; 228 int kern_load; 229 int s = splhigh(); 230 231 if (residual_burst > 0) { 232 schednetisr(NETISR_POLL); 233 /* will run immediately on return, followed by netisrs */ 234 splx(s); 235 return ; 236 } 237 /* here we can account time spent in netisr's in this tick */ 238 microuptime(&t); 239 kern_load = (t.tv_usec - poll_start_t.tv_usec) + 240 (t.tv_sec - poll_start_t.tv_sec)*1000000; /* us */ 241 kern_load = (kern_load * hz) / 10000; /* 0..100 */ 242 if (kern_load > (100 - user_frac)) { /* try decrease ticks */ 243 if (poll_burst > 1) 244 poll_burst--; 245 } else { 246 if (poll_burst < poll_burst_max) 247 poll_burst++; 248 } 249 250 if (need_poll_again) { 251 /* 252 * Last cycle was long and caused us to miss one or more 253 * hardclock ticks. Restart processnig again, but slightly 254 * reduce the burst size to prevent that this happens again. 
255 */ 256 need_poll_again--; 257 poll_burst -= (poll_burst / 8); 258 if (poll_burst < 1) 259 poll_burst = 1; 260 schednetisr(NETISR_POLL); 261 } else 262 poll1_active = 0; 263 splx(s); 264} 265 266/* 267 * ether_poll1 is called by schednetisr when appropriate, typically once 268 * per tick. It is called at splnet() so first thing to do is to upgrade to 269 * splimp(), and call all registered handlers. 270 */ 271void 272ether_poll1(void) 273{ 274 static int reg_frac_count; 275 int i, cycles; 276 enum poll_cmd arg = POLL_ONLY; 277 int s=splimp(); 278 mtx_lock(&Giant); 279 280 if (residual_burst == 0) { /* first call in this tick */ 281 microuptime(&poll_start_t); 282 /* 283 * Check that paremeters are consistent with runtime 284 * variables. Some of these tests could be done at sysctl 285 * time, but the savings would be very limited because we 286 * still have to check against reg_frac_count and 287 * poll_each_burst. So, instead of writing separate sysctl 288 * handlers, we do all here. 289 */ 290 291 if (reg_frac > hz) 292 reg_frac = hz; 293 else if (reg_frac < 1) 294 reg_frac = 1; 295 if (reg_frac_count > reg_frac) 296 reg_frac_count = reg_frac - 1; 297 if (reg_frac_count-- == 0) { 298 arg = POLL_AND_CHECK_STATUS; 299 reg_frac_count = reg_frac - 1; 300 } 301 if (poll_burst_max < MIN_POLL_BURST_MAX) 302 poll_burst_max = MIN_POLL_BURST_MAX; 303 else if (poll_burst_max > MAX_POLL_BURST_MAX) 304 poll_burst_max = MAX_POLL_BURST_MAX; 305 306 if (poll_each_burst < 1) 307 poll_each_burst = 1; 308 else if (poll_each_burst > poll_burst_max) 309 poll_each_burst = poll_burst_max; 310 311 residual_burst = poll_burst; 312 } 313 cycles = (residual_burst < poll_each_burst) ? 
314 residual_burst : poll_each_burst; 315 residual_burst -= cycles; 316 317 if (polling) { 318 for (i = 0 ; i < poll_handlers ; i++) 319 if (pr[i].handler && (IFF_UP|IFF_RUNNING) == 320 (pr[i].ifp->if_flags & (IFF_UP|IFF_RUNNING)) ) 321 pr[i].handler(pr[i].ifp, arg, cycles); 322 } else { /* unregister */ 323 for (i = 0 ; i < poll_handlers ; i++) { 324 if (pr[i].handler && 325 pr[i].ifp->if_flags & IFF_RUNNING) { 326 pr[i].ifp->if_ipending &= ~IFF_POLLING; 327 pr[i].handler(pr[i].ifp, POLL_DEREGISTER, 1); 328 } 329 pr[i].handler=NULL; 330 } 331 residual_burst = 0; 332 poll_handlers = 0; 333 } 334 /* on -stable, schednetisr(NETISR_POLLMORE); */ 335 mtx_unlock(&Giant); 336 splx(s); 337} 338 339/* 340 * Try to register routine for polling. Returns 1 if successful 341 * (and polling should be enabled), 0 otherwise. 342 * A device is not supposed to register itself multiple times. 343 * 344 * This is called from within the *_intr() function, so we should 345 * probably not need further locking. XXX 346 */ 347int 348ether_poll_register(poll_handler_t *h, struct ifnet *ifp) 349{ 350 int s; 351 352 if (polling == 0) /* polling disabled, cannot register */ 353 return 0; 354 if (h == NULL || ifp == NULL) /* bad arguments */ 355 return 0; 356 if ( !(ifp->if_flags & IFF_UP) ) /* must be up */ 357 return 0; 358 if (ifp->if_ipending & IFF_POLLING) /* already polling */ 359 return 0; 360 361 s = splhigh(); 362 if (poll_handlers >= POLL_LIST_LEN) { 363 /* 364 * List full, cannot register more entries. 365 * This should never happen; if it does, it is probably a 366 * broken driver trying to register multiple times. Checking 367 * this at runtime is expensive, and won't solve the problem 368 * anyways, so just report a few times and then give up. 
369 */ 370 static int verbose = 10 ; 371 splx(s); 372 if (verbose >0) { 373 printf("poll handlers list full, " 374 "maybe a broken driver ?\n"); 375 verbose--; 376 } 377 return 0; /* no polling for you */ 378 } 379 380 pr[poll_handlers].handler = h; 381 pr[poll_handlers].ifp = ifp; 382 poll_handlers++; 383 ifp->if_ipending |= IFF_POLLING; 384 splx(s); 385 return 1; /* polling enabled in next call */ 386} 387 388/* 389 * Remove the interface from the list of polling ones. 390 * Normally run by *_stop(). 391 * We allow it being called with IFF_POLLING clear, the 392 * call is sufficiently rare so it is preferable to save the 393 * space for the extra test in each device in exchange of one 394 * additional function call. 395 */ 396int 397ether_poll_deregister(struct ifnet *ifp) 398{ 399 int i; 400 401 mtx_lock(&Giant); 402 if ( !ifp || !(ifp->if_ipending & IFF_POLLING) ) { 403 mtx_unlock(&Giant); 404 return 0; 405 } 406 for (i = 0 ; i < poll_handlers ; i++) 407 if (pr[i].ifp == ifp) /* found it */ 408 break; 409 ifp->if_ipending &= ~IFF_POLLING; /* found or not... */ 410 if (i == poll_handlers) { 411 mtx_unlock(&Giant); 412 printf("ether_poll_deregister: ifp not found!!!\n"); 413 return 0; 414 } 415 poll_handlers--; 416 if (i < poll_handlers) { /* Last entry replaces this one. */ 417 pr[i].handler = pr[poll_handlers].handler; 418 pr[i].ifp = pr[poll_handlers].ifp; 419 } 420 mtx_unlock(&Giant); 421 return 1; 422} 423