subr_hash.c revision 137377
1/* 2 * Copyright (c) 1982, 1986, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 *
 *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_subr.c 137377 2004-11-08 06:57:31Z alc $");

#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#ifdef ZERO_COPY_SOCKETS
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#endif

/* Read-only sysctl exporting the iovec limit (sysconf(_SC_IOV_MAX)). */
SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

#ifdef ZERO_COPY_SOCKETS
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;

/*
 * "Page trading": give the kernel page backing kaddr to the user address
 * space at uaddr.  Any user page already present at uaddr is unmapped and
 * freed, and the kernel page is renamed into the backing object of the
 * user mapping in its place.
 *
 * Returns KERN_SUCCESS on success, or EFAULT when the user address cannot
 * be looked up for write.  NOTE(review): this mixes vm_map status codes
 * (KERN_SUCCESS == 0) with an errno (EFAULT); callers only test the result
 * for non-zero, so it works, but the return domain is inconsistent.
 */
static int
vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
{
	vm_map_t map = mapa;
	vm_page_t kern_pg, user_pg;
	vm_object_t uobject;
	vm_map_entry_t entry;
	vm_pindex_t upindex;
	vm_prot_t prot;
	boolean_t wired;

	/*
	 * First lookup the kernel page.
	 */
	kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
	/*
	 * XXX The vm object containing kern_pg needs locking.
	 */
	if ((vm_map_lookup(&map, uaddr,
			   VM_PROT_WRITE, &entry, &uobject,
			   &upindex, &prot, &wired)) != KERN_SUCCESS) {
		return(EFAULT);
	}
	VM_OBJECT_LOCK(uobject);
	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
		/*
		 * Wait until the existing user page is no longer busy,
		 * reacquiring the page queues lock after every sleep,
		 * then evict and free it.
		 */
		do
			vm_page_lock_queues();
		while (vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco"));
		pmap_remove_all(user_pg);
		vm_page_free(user_pg);
	} else
		vm_page_lock_queues();
	/*
	 * The kernel page must be idle — not busy, not held, and not on
	 * the free queue — before it may be renamed into the user object.
	 */
	if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) ||
	    (kern_pg->hold_count != 0)|| (kern_pg->flags & PG_BUSY)) {
		printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), "
		       "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex,
			kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 1 : 0,
			kern_pg->hold_count, (u_long)kern_pg->phys_addr);
		if ((kern_pg->queue - kern_pg->pc) == PQ_FREE)
			panic("vm_pgmoveco: renaming free page");
		else
			panic("vm_pgmoveco: renaming busy page");
	}
	vm_page_rename(kern_pg, uobject, upindex);
	kern_pg->valid = VM_PAGE_BITS_ALL;
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(uobject);
	vm_map_lookup_done(map, entry);
	return(KERN_SUCCESS);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Copy up to n bytes between the kernel buffer cp and the address ranges
 * described by uio, in the direction given by uio->uio_rw, advancing the
 * uio past the bytes moved.  Returns 0 on success or an errno from
 * copyin()/copyout().
 */
int
uiomove(void *cp, int n, struct uio *uio)
{
	struct thread *td = curthread;
	struct iovec *iov;
	u_int cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove proc"));
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "Calling uiomove()");

	/*
	 * Remember whether TDP_DEADLKTREAT was already set so that it is
	 * only cleared below if this call was the one that set it.
	 */
	save = td->td_pflags & TDP_DEADLKTREAT;
	td->td_pflags |= TDP_DEADLKTREAT;

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* Skip exhausted iovec entries. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			/*
			 * Yield voluntarily if this thread has been running
			 * for at least hogticks since it was switched in.
			 */
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				goto out;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			/* Account for the bytes without copying anything. */
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
out:
	if (save == 0)
		td->td_pflags &= ~TDP_DEADLKTREAT;
	return (error);
}

/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  Currently, uiomove accepts a signed (n) argument, which
 * is almost definitely a bad thing, so we catch that here as well.  We
 * return a runtime failure, but it might be desirable to generate a runtime
 * assertion failure instead.
 */
int
uiomove_frombuf(void *buf, int buflen, struct uio *uio)
{
	unsigned int offset, n;

	/*
	 * Reject negative offsets/resid, and offsets whose value does not
	 * survive the narrowing assignment to unsigned int.
	 */
	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
	    (offset = uio->uio_offset) != uio->uio_offset)
		return (EINVAL);
	if (buflen <= 0 || offset >= buflen)
		return (0);
	/* The remaining byte count must fit uiomove()'s signed int n. */
	if ((n = buflen - offset) > INT_MAX)
		return (EINVAL);
	return (uiomove((char *)buf + offset, n, uio));
}

#ifdef ZERO_COPY_SOCKETS
/*
 * Experimental support for zero-copy I/O
 */
/*
 * Move cnt bytes for a single iovec between the kernel buffer cp and
 * userspace.  On a read, page trading via vm_pgmoveco() is attempted only
 * when zero-copy receive is enabled, the length, the user and kernel
 * addresses, and the uio offset are all exactly page aligned, the page's
 * object is OBJT_DEFAULT, and the caller marked the buffer disposable;
 * otherwise (or if trading fails) a plain copyout()/copyin() is used.
 */
static int
userspaceco(void *cp, u_int cnt, struct uio *uio, struct vm_object *obj,
    int disposable)
{
	struct iovec *iov;
	int error;

	iov = uio->uio_iov;
	if (uio->uio_rw == UIO_READ) {
		if ((so_zero_copy_receive != 0)
		 && (obj != NULL)
		 && ((cnt & PAGE_MASK) == 0)
		 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		 && ((uio->uio_offset & PAGE_MASK) == 0)
		 && ((((intptr_t) cp) & PAGE_MASK) == 0)
		 && (obj->type == OBJT_DEFAULT)
		 && (disposable != 0)) {
			/* SOCKET: use page-trading */
			/*
			 * We only want to call vm_pgmoveco() on
			 * disposeable pages, since it gives the
			 * kernel page to the userland process.
			 */
			error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
			    (vm_offset_t)cp, (vm_offset_t)iov->iov_base);

			/*
			 * If we get an error back, attempt
			 * to use copyout() instead.  The
			 * disposable page should be freed
			 * automatically if we weren't able to move
			 * it into userland.
			 */
			if (error != 0)
				error = copyout(cp, iov->iov_base, cnt);
		} else {
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
	return (error);
}

/*
 * Zero-copy-capable variant of uiomove(): identical in structure, except
 * that userspace transfers go through userspaceco() (which may trade pages
 * instead of copying) and errors are returned directly rather than via a
 * cleanup label.  Note it does not set TDP_DEADLKTREAT as uiomove() does.
 */
int
uiomoveco(void *cp, int n, struct uio *uio, struct vm_object *obj,
    int disposable)
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* Skip exhausted iovec entries. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			/* Yield if this thread has run a full time slice. */
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();

			error = userspaceco(cp, cnt, uio, obj, disposable);

			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			/* Account for the bytes without copying anything. */
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
	return (0);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Give next character to user as result of read.
316 */ 317int 318ureadc(int c, struct uio *uio) 319{ 320 struct iovec *iov; 321 char *iov_base; 322 323again: 324 if (uio->uio_iovcnt == 0 || uio->uio_resid == 0) 325 panic("ureadc"); 326 iov = uio->uio_iov; 327 if (iov->iov_len == 0) { 328 uio->uio_iovcnt--; 329 uio->uio_iov++; 330 goto again; 331 } 332 switch (uio->uio_segflg) { 333 334 case UIO_USERSPACE: 335 if (subyte(iov->iov_base, c) < 0) 336 return (EFAULT); 337 break; 338 339 case UIO_SYSSPACE: 340 iov_base = iov->iov_base; 341 *iov_base = c; 342 iov->iov_base = iov_base; 343 break; 344 345 case UIO_NOCOPY: 346 break; 347 } 348 iov->iov_base = (char *)iov->iov_base + 1; 349 iov->iov_len--; 350 uio->uio_resid--; 351 uio->uio_offset++; 352 return (0); 353} 354 355/* 356 * General routine to allocate a hash table. 357 */ 358void * 359hashinit(int elements, struct malloc_type *type, u_long *hashmask) 360{ 361 long hashsize; 362 LIST_HEAD(generic, generic) *hashtbl; 363 int i; 364 365 if (elements <= 0) 366 panic("hashinit: bad elements"); 367 for (hashsize = 1; hashsize <= elements; hashsize <<= 1) 368 continue; 369 hashsize >>= 1; 370 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); 371 for (i = 0; i < hashsize; i++) 372 LIST_INIT(&hashtbl[i]); 373 *hashmask = hashsize - 1; 374 return (hashtbl); 375} 376 377void 378hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask) 379{ 380 LIST_HEAD(generic, generic) *hashtbl, *hp; 381 382 hashtbl = vhashtbl; 383 for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++) 384 if (!LIST_EMPTY(hp)) 385 panic("hashdestroy: hash not empty"); 386 free(hashtbl, type); 387} 388 389static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039, 390 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653, 391 7159, 7673, 8191, 12281, 16381, 24571, 32749 }; 392#define NPRIMES (sizeof(primes) / sizeof(primes[0])) 393 394/* 395 * General routine to allocate a prime number sized hash table. 
 */
/*
 * Like hashinit(), but the table size is the largest entry of primes[]
 * not exceeding 'elements' (so at least 13 for elements >= 13, else 1).
 * *nentries receives the chosen size.  Panics on elements <= 0.
 */
void *
phashinit(int elements, struct malloc_type *type, u_long *nentries)
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("phashinit: bad elements");
	/* Scan forward until primes[i] would exceed elements (or table end). */
	for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
		i++;
		if (i == NPRIMES)
			break;
		hashsize = primes[i];
	}
	/* Step back to the last prime that did not exceed elements. */
	hashsize = primes[i - 1];
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*nentries = hashsize;
	return (hashtbl);
}

/*
 * Voluntarily surrender the CPU: drop Giant, lower the thread to its
 * user priority, and perform an involuntary-style context switch under
 * sched_lock, reacquiring Giant afterwards.  The lock/DROP/switch order
 * here is deliberate; do not reorder.
 */
void
uio_yield(void)
{
	struct thread *td;

	td = curthread;
	mtx_lock_spin(&sched_lock);
	DROP_GIANT();
	sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */
	mi_switch(SW_INVOL, NULL);
	mtx_unlock_spin(&sched_lock);
	PICKUP_GIANT();
}

/*
 * Copy len bytes from src to dst, where src lives in the address space
 * named by seg (UIO_USERSPACE or UIO_SYSSPACE).  Panics on any other seg.
 */
int
copyinfrom(const void * __restrict src, void * __restrict dst, size_t len,
    int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyin(src, dst, len);
		break;
	case UIO_SYSSPACE:
		bcopy(src, dst, len);
		break;
	default:
		panic("copyinfrom: bad seg %d\n", seg);
	}
	return (error);
}

/*
 * copyinstr()/copystr() analogue of copyinfrom(): copy a NUL-terminated
 * string of at most len bytes from the address space named by seg,
 * reporting the copied length via *copied.  Panics on an unknown seg.
 */
int
copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len,
    size_t * __restrict copied, int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyinstr(src, dst, len, copied);
		break;
	case UIO_SYSSPACE:
		error = copystr(src, dst, len, copied);
		break;
	default:
		panic("copyinstrfrom: bad seg %d\n", seg);
	}
	return (error);
}

/*
 * Copy an array of iovcnt iovecs in from userspace into freshly
 * malloc'd kernel memory, returned via *iov (NULL on failure).  When
 * iovcnt exceeds UIO_MAXIOV the caller-supplied 'error' value is
 * returned — note the unusual use of the parameter as the error code
 * for that one case.
 */
int
copyiniov(struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
{
	u_int iovlen;

	*iov = NULL;
	if (iovcnt > UIO_MAXIOV)
		return (error);
	/* No overflow: iovcnt <= UIO_MAXIOV bounds the product. */
	iovlen = iovcnt * sizeof (struct iovec);
	*iov = malloc(iovlen, M_IOV, M_WAITOK);
	error = copyin(iovp, *iov, iovlen);
	if (error) {
484 free(*iov, M_IOV); 485 *iov = NULL; 486 } 487 return (error); 488} 489 490int 491copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop) 492{ 493 struct iovec *iov; 494 struct uio *uio; 495 u_int iovlen; 496 int error, i; 497 498 *uiop = NULL; 499 if (iovcnt > UIO_MAXIOV) 500 return (EINVAL); 501 iovlen = iovcnt * sizeof (struct iovec); 502 uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK); 503 iov = (struct iovec *)(uio + 1); 504 error = copyin(iovp, iov, iovlen); 505 if (error) { 506 free(uio, M_IOV); 507 return (error); 508 } 509 uio->uio_iov = iov; 510 uio->uio_iovcnt = iovcnt; 511 uio->uio_segflg = UIO_USERSPACE; 512 uio->uio_offset = -1; 513 uio->uio_resid = 0; 514 for (i = 0; i < iovcnt; i++) { 515 if (iov->iov_len > INT_MAX - uio->uio_resid) { 516 free(uio, M_IOV); 517 return (EINVAL); 518 } 519 uio->uio_resid += iov->iov_len; 520 iov++; 521 } 522 *uiop = uio; 523 return (0); 524} 525 526struct uio * 527cloneuio(struct uio *uiop) 528{ 529 struct uio *uio; 530 int iovlen; 531 532 iovlen = uiop->uio_iovcnt * sizeof (struct iovec); 533 uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK); 534 *uio = *uiop; 535 uio->uio_iov = (struct iovec *)(uio + 1); 536 bcopy(uiop->uio_iov, uio->uio_iov, iovlen); 537 return (uio); 538} 539