kern_subr.c revision 137244
1/* 2 * Copyright (c) 1982, 1986, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 *
 * @(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_subr.c 137244 2004-11-05 06:52:29Z alc $");

#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#ifdef ZERO_COPY_SOCKETS
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#endif

/* Export UIO_MAXIOV read-only as kern.iov_max for sysconf(_SC_IOV_MAX). */
SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

#ifdef ZERO_COPY_SOCKETS
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;

/*
 * Hand the kernel page backing kaddr to the user address space at uaddr
 * by renaming it into the user object ("page trading") rather than
 * copying the data.  Any page already resident at uaddr is unmapped and
 * freed first.  Returns KERN_SUCCESS, or EFAULT if the user address
 * cannot be looked up for write.
 */
static int
vm_pgmoveco(vm_map_t mapa, vm_object_t srcobj, vm_offset_t kaddr,
    vm_offset_t uaddr)
{
	vm_map_t map = mapa;
	vm_page_t kern_pg, user_pg;
	vm_object_t uobject;
	vm_map_entry_t entry;
	vm_pindex_t upindex;
	vm_prot_t prot;
	boolean_t wired;

	/*
	 * First lookup the kernel page.
	 */
	kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
	/*
	 * XXX The vm object containing kern_pg needs locking.
	 */
	if ((vm_map_lookup(&map, uaddr,
	    VM_PROT_WRITE, &entry, &uobject,
	    &upindex, &prot, &wired)) != KERN_SUCCESS) {
		return(EFAULT);
	}
	VM_OBJECT_LOCK(uobject);
	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
		/*
		 * Evict the page currently resident at the user address;
		 * retry the queue lock until the page is no longer busy.
		 */
		do
			vm_page_lock_queues();
		while (vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco"));
		pmap_remove_all(user_pg);
		vm_page_free(user_pg);
	} else
		vm_page_lock_queues();
	/* The kernel page must be idle before it may be renamed. */
	if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) ||
	    (kern_pg->hold_count != 0)|| (kern_pg->flags & PG_BUSY)) {
		printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), "
		    "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex,
		    kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 1 : 0,
		    kern_pg->hold_count, (u_long)kern_pg->phys_addr);
		if ((kern_pg->queue - kern_pg->pc) == PQ_FREE)
			panic("vm_pgmoveco: renaming free page");
		else
			panic("vm_pgmoveco: renaming busy page");
	}
	vm_page_rename(kern_pg, uobject, upindex);
	kern_pg->valid = VM_PAGE_BITS_ALL;
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(uobject);
	vm_map_lookup_done(map, entry);
	return(KERN_SUCCESS);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Move n bytes between the kernel buffer cp and the I/O state described
 * by uio; direction is given by uio->uio_rw.  The iovec array, resid and
 * offset inside uio are advanced as data moves.  Returns 0 on success or
 * an errno from copyin()/copyout().
 */
int
uiomove(void *cp, int n, struct uio *uio)
{
	struct thread *td = curthread;
	struct iovec *iov;
	u_int cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove proc"));
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "Calling uiomove()");

	/*
	 * Set the deadlock-treatment flag for the duration of the copy,
	 * restoring the caller's previous setting on the way out.
	 */
	save = td->td_pflags & TDP_DEADLKTREAT;
	td->td_pflags |= TDP_DEADLKTREAT;

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* This iovec is exhausted; advance to the next. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			/* Yield the CPU if we have been running too long. */
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				goto out;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			/* Advance the uio state without moving data. */
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
out:
	if (save == 0)
		td->td_pflags &= ~TDP_DEADLKTREAT;
	return (error);
}

/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  Currently, uiomove accepts a signed (n) argument, which
 * is almost definitely a bad thing, so we catch that here as well.  We
 * return a runtime failure, but it might be desirable to generate a runtime
 * assertion failure instead.
 */
int
uiomove_frombuf(void *buf, int buflen, struct uio *uio)
{
	unsigned int offset, n;

	/* Reject negative state and offsets that do not round-trip u_int. */
	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
	    (offset = uio->uio_offset) != uio->uio_offset)
		return (EINVAL);
	/* Reading at or past the end of the buffer yields EOF, not error. */
	if (buflen <= 0 || offset >= buflen)
		return (0);
	if ((n = buflen - offset) > INT_MAX)
		return (EINVAL);
	return (uiomove((char *)buf + offset, n, uio));
}

#ifdef ZERO_COPY_SOCKETS
/*
 * Experimental support for zero-copy I/O
 *
 * Move cnt bytes for the current iovec of uio.  On a read, attempt
 * page trading via vm_pgmoveco() when every address and length is
 * page-aligned, the object is a default VM object, and the page is
 * disposable; otherwise (or on failure) fall back to copyout().
 */
static int
userspaceco(void *cp, u_int cnt, struct uio *uio, struct vm_object *obj,
    int disposable)
{
	struct iovec *iov;
	int error;

	iov = uio->uio_iov;
	if (uio->uio_rw == UIO_READ) {
		if ((so_zero_copy_receive != 0)
		    && (obj != NULL)
		    && ((cnt & PAGE_MASK) == 0)
		    && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		    && ((uio->uio_offset & PAGE_MASK) == 0)
		    && ((((intptr_t) cp) & PAGE_MASK) == 0)
		    && (obj->type == OBJT_DEFAULT)
		    && (disposable != 0)) {
			/* SOCKET: use page-trading */
			/*
			 * We only want to call vm_pgmoveco() on
			 * disposeable pages, since it gives the
			 * kernel page to the userland process.
			 */
			error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
			    obj, (vm_offset_t)cp,
			    (vm_offset_t)iov->iov_base);

			/*
			 * If we get an error back, attempt
			 * to use copyout() instead.  The
			 * disposable page should be freed
			 * automatically if we weren't able to move
			 * it into userland.
			 */
			if (error != 0)
				error = copyout(cp, iov->iov_base, cnt);
		} else {
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
	return (error);
}

/*
 * Zero-copy variant of uiomove(): identical state handling, but user-space
 * reads go through userspaceco(), which may donate disposable kernel pages
 * to the user process instead of copying.
 */
int
uiomoveco(void *cp, int n, struct uio *uio, struct vm_object *obj,
    int disposable)
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* This iovec is exhausted; advance to the next. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			/* Yield the CPU if we have been running too long. */
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();

			error = userspaceco(cp, cnt, uio, obj, disposable);

			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
	return (0);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Give next character to user as result of read.
 * Writes one byte c through the current iovec of uio and advances the
 * uio state by one.  Panics if the uio is already exhausted.  Returns 0,
 * or EFAULT if the user-space store fails.
 */
int
ureadc(int c, struct uio *uio)
{
	struct iovec *iov;
	char *iov_base;

again:
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		/* Skip empty iovec entries. */
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		iov_base = iov->iov_base;
		*iov_base = c;
		iov->iov_base = iov_base;
		break;

	case UIO_NOCOPY:
		break;
	}
	/* Consume exactly one byte of the transfer. */
	iov->iov_base = (char *)iov->iov_base + 1;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}

/*
 * General routine to allocate a hash table.
 * The table has the largest power-of-two size not greater than "elements";
 * *hashmask receives size - 1 for use as an index mask.
 */
void *
hashinit(int elements, struct malloc_type *type, u_long *hashmask)
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("hashinit: bad elements");
	/* Round up past "elements" in powers of two, then back off one. */
	for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
		continue;
	hashsize >>= 1;
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*hashmask = hashsize - 1;
	return (hashtbl);
}

/*
 * Free a hash table allocated with hashinit().  Every chain must already
 * be empty; a populated chain indicates a caller bug, so we panic.
 */
void
hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask)
{
	LIST_HEAD(generic, generic) *hashtbl, *hp;

	hashtbl = vhashtbl;
	for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++)
		if (!LIST_EMPTY(hp))
			panic("hashdestroy: hash not empty");
	free(hashtbl, type);
}

/* Candidate table sizes for phashinit(); primes[0] is a fallback of 1. */
static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
			2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
			7159, 7673, 8191, 12281, 16381, 24571, 32749 };
#define NPRIMES (sizeof(primes) / sizeof(primes[0]))

/*
 * General routine to allocate a prime number sized hash table.
 * The size is the largest prime in primes[] not greater than "elements"
 * (capped at the last table entry); *nentries receives the chosen size.
 */
void *
phashinit(int elements, struct malloc_type *type, u_long *nentries)
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("phashinit: bad elements");
	/* Walk primes[] until one exceeds "elements", then step back. */
	for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
		i++;
		if (i == NPRIMES)
			break;
		hashsize = primes[i];
	}
	hashsize = primes[i - 1];
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*nentries = hashsize;
	return (hashtbl);
}

/*
 * Voluntarily give up the CPU: drop to user priority and force an
 * involuntary context switch.  Used by the uiomove() loops to keep long
 * copies from hogging the processor.
 */
void
uio_yield(void)
{
	struct thread *td;

	td = curthread;
	mtx_lock_spin(&sched_lock);
	DROP_GIANT();
	sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */
	mi_switch(SW_INVOL, NULL);
	mtx_unlock_spin(&sched_lock);
	PICKUP_GIANT();
}

/*
 * Copy len bytes from src, located in the address space named by seg
 * (UIO_USERSPACE or UIO_SYSSPACE), into the kernel buffer dst.
 * Returns 0 or an errno from copyin(); panics on an unknown seg.
 */
int
copyinfrom(const void * __restrict src, void * __restrict dst, size_t len,
    int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyin(src, dst, len);
		break;
	case UIO_SYSSPACE:
		bcopy(src, dst, len);
		break;
	default:
		panic("copyinfrom: bad seg %d\n", seg);
	}
	return (error);
}

/*
 * Like copyinfrom(), but copies a NUL-terminated string of at most len
 * bytes; *copied receives the number of bytes copied including the NUL.
 */
int
copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len,
    size_t * __restrict copied, int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyinstr(src, dst, len, copied);
		break;
	case UIO_SYSSPACE:
		error = copystr(src, dst, len, copied);
		break;
	default:
		panic("copyinstrfrom: bad seg %d\n", seg);
	}
	return (error);
}

/*
 * Copy an iovec array from userland into a freshly allocated *iov.
 * "error" is the errno value to return when iovcnt exceeds UIO_MAXIOV
 * (the caller picks it).  On any failure *iov is NULL; on success the
 * caller owns *iov and frees it with free(*iov, M_IOV).
 */
int
copyiniov(struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
{
	u_int iovlen;

	*iov = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (error);
	iovlen = iovcnt * sizeof (struct iovec);
	*iov = malloc(iovlen, M_IOV, M_WAITOK);
	error = copyin(iovp, *iov, iovlen);
	if (error) {
		free(*iov, M_IOV);
		*iov = NULL;
	}
	return (error);
}

/*
 * Copy a user iovec array into a single allocation holding a struct uio
 * followed immediately by its iovecs, and initialize it for a userspace
 * transfer.  The total length must not exceed INT_MAX.  On success the
 * caller owns *uiop and frees it with free(uio, M_IOV).
 */
int
copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop)
{
	struct iovec *iov;
	struct uio *uio;
	u_int iovlen;
	int error, i;

	*uiop = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (EINVAL);
	iovlen = iovcnt * sizeof (struct iovec);
	/* One allocation: the iovec array lives right after the uio. */
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	iov = (struct iovec *)(uio + 1);
	error = copyin(iovp, iov, iovlen);
	if (error) {
		free(uio, M_IOV);
		return (error);
	}
	uio->uio_iov = iov;
	uio->uio_iovcnt = iovcnt;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_offset = -1;
	uio->uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		/* Reject totals that would overflow the signed resid. */
		if (iov->iov_len > INT_MAX - uio->uio_resid) {
			free(uio, M_IOV);
			return (EINVAL);
		}
		uio->uio_resid += iov->iov_len;
		iov++;
	}
	*uiop = uio;
	return (0);
}

/*
 * Duplicate a struct uio together with its iovec array into one new
 * allocation; the caller frees the result with free(uio, M_IOV).
 */
struct uio *
cloneuio(struct uio *uiop)
{
	struct uio *uio;
	int iovlen;

	iovlen = uiop->uio_iovcnt * sizeof (struct iovec);
	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
	*uio = *uiop;
	uio->uio_iov = (struct iovec *)(uio + 1);
	bcopy(uiop->uio_iov, uio->uio_iov, iovlen);
	return (uio);
}