subr_hash.c revision 138424
/*
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_subr.c 138424 2004-12-06 00:43:40Z alc $");

#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#ifdef ZERO_COPY_SOCKETS
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#endif

SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");
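/*
 * Example (illustrative sketch, not part of the original file): userland
 * sees the limit exported above through sysconf(3), so a portable program
 * can size its iovec array at run time:
 *
 *	#include <limits.h>
 *	#include <unistd.h>
 *
 *	long iov_max;
 *
 *	iov_max = sysconf(_SC_IOV_MAX);
 *	if (iov_max == -1)
 *		iov_max = _XOPEN_IOV_MAX;	(portable minimum of 16)
 */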
#ifdef ZERO_COPY_SOCKETS
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;

static int
vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
{
	vm_map_t map = mapa;
	vm_page_t kern_pg, user_pg;
	vm_object_t uobject;
	vm_map_entry_t entry;
	vm_pindex_t upindex;
	vm_prot_t prot;
	boolean_t wired;

	/*
	 * First lookup the kernel page.
	 */
	kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));

	if ((vm_map_lookup(&map, uaddr,
			   VM_PROT_WRITE, &entry, &uobject,
			   &upindex, &prot, &wired)) != KERN_SUCCESS) {
		return(EFAULT);
	}
	VM_OBJECT_LOCK(uobject);
	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
		do
			vm_page_lock_queues();
		while (vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco"));
		pmap_remove_all(user_pg);
		vm_page_free(user_pg);
	} else
		vm_page_lock_queues();
	if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) ||
	    (kern_pg->hold_count != 0) || (kern_pg->flags & PG_BUSY)) {
		printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), "
		    "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex,
		    kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 1 : 0,
		    kern_pg->hold_count, (u_long)kern_pg->phys_addr);
		if ((kern_pg->queue - kern_pg->pc) == PQ_FREE)
			panic("vm_pgmoveco: renaming free page");
		else
			panic("vm_pgmoveco: renaming busy page");
	}
	vm_page_insert(kern_pg, uobject, upindex);
	vm_page_dirty(kern_pg);
	kern_pg->valid = VM_PAGE_BITS_ALL;
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(uobject);
	vm_map_lookup_done(map, entry);
	return(KERN_SUCCESS);
}
#endif /* ZERO_COPY_SOCKETS */

int
uiomove(void *cp, int n, struct uio *uio)
{
	struct thread *td = curthread;
	struct iovec *iov;
	u_int cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove proc"));
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "Calling uiomove()");

	save = td->td_pflags & TDP_DEADLKTREAT;
	td->td_pflags |= TDP_DEADLKTREAT;

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				goto out;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
out:
	if (save == 0)
		td->td_pflags &= ~TDP_DEADLKTREAT;
	return (error);
}
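/*
 * Example (an illustrative sketch, not part of this file): the classic
 * consumer of uiomove() is a character device read routine copying out
 * of a driver-private buffer; "echo_softc", "sc->buf" and "sc->len" are
 * hypothetical names:
 *
 *	static int
 *	echo_read(struct cdev *dev, struct uio *uio, int ioflag)
 *	{
 *		struct echo_softc *sc = dev->si_drv1;
 *		int amt;
 *
 *		if (uio->uio_offset >= sc->len)
 *			return (0);		(EOF)
 *		amt = MIN(uio->uio_resid, sc->len - uio->uio_offset);
 *		return (uiomove(sc->buf + uio->uio_offset, amt, uio));
 *	}
 *
 * uiomove() advances uio_offset and shrinks uio_resid itself, so the
 * caller only decides how much of its buffer is in range.
 */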
/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  Currently, uiomove accepts a signed (n) argument, which
 * is almost definitely a bad thing, so we catch that here as well.  We
 * return a runtime failure, but it might be desirable to generate a runtime
 * assertion failure instead.
 */
int
uiomove_frombuf(void *buf, int buflen, struct uio *uio)
{
	unsigned int offset, n;

	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
	    (offset = uio->uio_offset) != uio->uio_offset)
		return (EINVAL);
	if (buflen <= 0 || offset >= buflen)
		return (0);
	if ((n = buflen - offset) > INT_MAX)
		return (EINVAL);
	return (uiomove((char *)buf + offset, n, uio));
}

#ifdef ZERO_COPY_SOCKETS
/*
 * Experimental support for zero-copy I/O
 */
static int
userspaceco(void *cp, u_int cnt, struct uio *uio, struct vm_object *obj,
    int disposable)
{
	struct iovec *iov;
	int error;

	iov = uio->uio_iov;
	if (uio->uio_rw == UIO_READ) {
		if ((so_zero_copy_receive != 0)
		 && (obj == NULL)
		 && ((cnt & PAGE_MASK) == 0)
		 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		 && ((uio->uio_offset & PAGE_MASK) == 0)
		 && ((((intptr_t) cp) & PAGE_MASK) == 0)
		 && (disposable != 0)) {
			/* SOCKET: use page-trading */
			/*
			 * We only want to call vm_pgmoveco() on
			 * disposable pages, since it gives the
			 * kernel page to the userland process.
			 */
			error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
			    (vm_offset_t)cp, (vm_offset_t)iov->iov_base);

			/*
			 * If we get an error back, attempt
			 * to use copyout() instead.  The
			 * disposable page should be freed
			 * automatically if we weren't able to move
			 * it into userland.
			 */
			if (error != 0)
				error = copyout(cp, iov->iov_base, cnt);
		} else {
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
	return (error);
}

int
uiomoveco(void *cp, int n, struct uio *uio, struct vm_object *obj,
    int disposable)
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();

			error = userspaceco(cp, cnt, uio, obj, disposable);

			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
	return (0);
}
#endif /* ZERO_COPY_SOCKETS */
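/*
 * Example (hypothetical, for illustration only): uiomove_frombuf() above
 * suits read routines that expose one complete, known-length kernel
 * buffer, since it performs the offset and length clipping itself;
 * "foo_read" and "foo_banner" are made-up names:
 *
 *	static char foo_banner[] = "foo0: ready\n";
 *
 *	static int
 *	foo_read(struct cdev *dev, struct uio *uio, int ioflag)
 *	{
 *		return (uiomove_frombuf(foo_banner, sizeof(foo_banner) - 1,
 *		    uio));
 *	}
 */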
/*
 * Give next character to user as result of read.
 */
int
ureadc(int c, struct uio *uio)
{
	struct iovec *iov;
	char *iov_base;

again:
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		iov_base = iov->iov_base;
		*iov_base = c;
		iov->iov_base = iov_base;
		break;

	case UIO_NOCOPY:
		break;
	}
	iov->iov_base = (char *)iov->iov_base + 1;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}

/*
 * General routine to allocate a hash table.
 */
void *
hashinit(int elements, struct malloc_type *type, u_long *hashmask)
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("hashinit: bad elements");
	for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
		continue;
	hashsize >>= 1;
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*hashmask = hashsize - 1;
	return (hashtbl);
}

void
hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask)
{
	LIST_HEAD(generic, generic) *hashtbl, *hp;

	hashtbl = vhashtbl;
	for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++)
		if (!LIST_EMPTY(hp))
			panic("hashdestroy: hash not empty");
	free(hashtbl, type);
}

static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
			2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
			7159, 7673, 8191, 12281, 16381, 24571, 32749 };
#define	NPRIMES (sizeof(primes) / sizeof(primes[0]))
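/*
 * Example (a sketch with hypothetical names): hashinit() sizes the table
 * to the largest power of two not greater than the element hint and hands
 * back both the bucket array and the mask, so picking a bucket is a single
 * AND; hashdestroy() requires every bucket to be empty first:
 *
 *	static LIST_HEAD(foo_head, foo) *foo_hashtbl;
 *	static u_long foo_hashmask;
 *
 *	foo_hashtbl = hashinit(128, M_TEMP, &foo_hashmask);
 *	LIST_INSERT_HEAD(&foo_hashtbl[key & foo_hashmask], fp, f_hash);
 *	...
 *	LIST_REMOVE(fp, f_hash);
 *	hashdestroy(foo_hashtbl, M_TEMP, foo_hashmask);
 */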
/*
 * General routine to allocate a prime number sized hash table.
 */
void *
phashinit(int elements, struct malloc_type *type, u_long *nentries)
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("phashinit: bad elements");
	for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
		i++;
		if (i == NPRIMES)
			break;
		hashsize = primes[i];
	}
	hashsize = primes[i - 1];
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*nentries = hashsize;
	return (hashtbl);
}

void
uio_yield(void)
{
	struct thread *td;

	td = curthread;
	mtx_lock_spin(&sched_lock);
	DROP_GIANT();
	sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */
	mi_switch(SW_INVOL, NULL);
	mtx_unlock_spin(&sched_lock);
	PICKUP_GIANT();
}

int
copyinfrom(const void * __restrict src, void * __restrict dst, size_t len,
    int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyin(src, dst, len);
		break;
	case UIO_SYSSPACE:
		bcopy(src, dst, len);
		break;
	default:
		panic("copyinfrom: bad seg %d\n", seg);
	}
	return (error);
}

int
copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len,
    size_t * __restrict copied, int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyinstr(src, dst, len, copied);
		break;
	case UIO_SYSSPACE:
		error = copystr(src, dst, len, copied);
		break;
	default:
		panic("copyinstrfrom: bad seg %d\n", seg);
	}
	return (error);
}

int
copyiniov(struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
{
	u_int iovlen;

	*iov = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (error);
	iovlen = iovcnt * sizeof (struct iovec);
	*iov = malloc(iovlen, M_IOV, M_WAITOK);
	error = copyin(iovp, *iov, iovlen);
	if (error) {
		free(*iov, M_IOV);
		*iov = NULL;
	}
	return (error);
}

int
copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop)
{
	struct iovec *iov;
	struct uio *uio;
	u_int iovlen;
	int error, i;

	*uiop = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (EINVAL);
	iovlen = iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	iov = (struct iovec *)(uio + 1);
	error = copyin(iovp, iov, iovlen);
	if (error) {
		free(uio, M_IOV);
		return (error);
	}
	uio->uio_iov = iov;
	uio->uio_iovcnt = iovcnt;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_offset = -1;
	uio->uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		if (iov->iov_len > INT_MAX - uio->uio_resid) {
			free(uio, M_IOV);
			return (EINVAL);
		}
		uio->uio_resid += iov->iov_len;
		iov++;
	}
	*uiop = uio;
	return (0);
}

struct uio *
cloneuio(struct uio *uiop)
{
	struct uio *uio;
	int iovlen;

	iovlen = uiop->uio_iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	*uio = *uiop;
	uio->uio_iov = (struct iovec *)(uio + 1);
	bcopy(uiop->uio_iov, uio->uio_iov, iovlen);
	return (uio);
}
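/*
 * Example (a hedged sketch of a hypothetical writev(2)-style path, not
 * code from this file): copyinuio() fills in the iovec array, segment
 * flag and resid, and sets uio_offset to -1; the caller supplies uio_rw
 * and uio_td, hands the uio to a consumer, and frees it with M_IOV:
 *
 *	struct uio *auio;
 *	int error;
 *
 *	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 *	if (error)
 *		return (error);
 *	auio->uio_rw = UIO_WRITE;
 *	auio->uio_td = td;
 *	error = fo_write(fp, auio, td->td_ucred, 0, td);
 *	free(auio, M_IOV);
 *	return (error);
 *
 * fo_write() stands in for whatever sink consumes the uio; uap->iovp and
 * uap->iovcnt are assumed syscall argument names.
 */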