kern_subr.c revision 111739
1/* 2 * Copyright (c) 1982, 1986, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 39 * $FreeBSD: head/sys/kern/kern_subr.c 111739 2003-03-02 15:29:13Z des $ 40 */ 41 42#include "opt_zero.h" 43 44#include <sys/param.h> 45#include <sys/systm.h> 46#include <sys/kernel.h> 47#include <sys/ktr.h> 48#include <sys/lock.h> 49#include <sys/mutex.h> 50#include <sys/proc.h> 51#include <sys/malloc.h> 52#include <sys/resourcevar.h> 53#include <sys/sched.h> 54#include <sys/sysctl.h> 55#include <sys/vnode.h> 56 57#include <vm/vm.h> 58#include <vm/vm_page.h> 59#include <vm/vm_map.h> 60#ifdef ZERO_COPY_SOCKETS 61#include <vm/vm_param.h> 62#endif 63#if defined(ZERO_COPY_SOCKETS) || defined(ENABLE_VFS_IOOPT) 64#include <vm/vm_object.h> 65#endif 66 67SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV, 68 "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)"); 69 70#ifdef ZERO_COPY_SOCKETS 71/* Declared in uipc_socket.c */ 72extern int so_zero_copy_receive; 73 74static int 75vm_pgmoveco(vm_map_t mapa, vm_object_t srcobj, vm_offset_t kaddr, 76 vm_offset_t uaddr) 77{ 78 vm_map_t map = mapa; 79 vm_page_t kern_pg, user_pg; 80 vm_object_t uobject; 81 vm_map_entry_t entry; 82 vm_pindex_t upindex, kpindex; 83 vm_prot_t prot; 84 boolean_t wired; 85 86 /* 87 * First lookup the kernel page. 
88 */ 89 kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr)); 90 91 if ((vm_map_lookup(&map, uaddr, 92 VM_PROT_READ, &entry, &uobject, 93 &upindex, &prot, &wired)) != KERN_SUCCESS) { 94 return(EFAULT); 95 } 96 if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) { 97 do 98 vm_page_lock_queues(); 99 while (vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco")); 100 vm_page_busy(user_pg); 101 pmap_remove_all(user_pg); 102 vm_page_free(user_pg); 103 } else 104 vm_page_lock_queues(); 105 if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) || 106 (kern_pg->hold_count != 0)|| (kern_pg->flags & PG_BUSY)) { 107 printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), " 108 "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex, 109 kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 1 : 0, 110 kern_pg->hold_count, (u_long)kern_pg->phys_addr); 111 if ((kern_pg->queue - kern_pg->pc) == PQ_FREE) 112 panic("vm_pgmoveco: renaming free page"); 113 else 114 panic("vm_pgmoveco: renaming busy page"); 115 } 116 kpindex = kern_pg->pindex; 117 vm_page_busy(kern_pg); 118 vm_page_rename(kern_pg, uobject, upindex); 119 vm_page_flag_clear(kern_pg, PG_BUSY); 120 kern_pg->valid = VM_PAGE_BITS_ALL; 121 vm_page_unlock_queues(); 122 123 vm_map_lookup_done(map, entry); 124 return(KERN_SUCCESS); 125} 126#endif /* ZERO_COPY_SOCKETS */ 127 128int 129uiomove(void *cp, int n, struct uio *uio) 130{ 131 struct thread *td = curthread; 132 struct iovec *iov; 133 u_int cnt; 134 int error = 0; 135 int save = 0; 136 137 KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, 138 ("uiomove: mode")); 139 KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, 140 ("uiomove proc")); 141 142 if (td) { 143 mtx_lock_spin(&sched_lock); 144 save = td->td_flags & TDF_DEADLKTREAT; 145 td->td_flags |= TDF_DEADLKTREAT; 146 mtx_unlock_spin(&sched_lock); 147 } 148 149 while (n > 0 && uio->uio_resid) { 150 iov = uio->uio_iov; 151 cnt = iov->iov_len; 152 if (cnt == 0) { 153 uio->uio_iov++; 154 uio->uio_iovcnt--; 
155 continue; 156 } 157 if (cnt > n) 158 cnt = n; 159 160 switch (uio->uio_segflg) { 161 162 case UIO_USERSPACE: 163 if (ticks - PCPU_GET(switchticks) >= hogticks) 164 uio_yield(); 165 if (uio->uio_rw == UIO_READ) 166 error = copyout(cp, iov->iov_base, cnt); 167 else 168 error = copyin(iov->iov_base, cp, cnt); 169 if (error) 170 goto out; 171 break; 172 173 case UIO_SYSSPACE: 174 if (uio->uio_rw == UIO_READ) 175 bcopy(cp, iov->iov_base, cnt); 176 else 177 bcopy(iov->iov_base, cp, cnt); 178 break; 179 case UIO_NOCOPY: 180 break; 181 } 182 iov->iov_base = (char *)iov->iov_base + cnt; 183 iov->iov_len -= cnt; 184 uio->uio_resid -= cnt; 185 uio->uio_offset += cnt; 186 cp = (char *)cp + cnt; 187 n -= cnt; 188 } 189out: 190 if (td != curthread) printf("uiomove: IT CHANGED!"); 191 td = curthread; /* Might things have changed in copyin/copyout? */ 192 if (td) { 193 mtx_lock_spin(&sched_lock); 194 td->td_flags = (td->td_flags & ~TDF_DEADLKTREAT) | save; 195 mtx_unlock_spin(&sched_lock); 196 } 197 return (error); 198} 199 200#if defined(ENABLE_VFS_IOOPT) || defined(ZERO_COPY_SOCKETS) 201/* 202 * Experimental support for zero-copy I/O 203 */ 204static int 205userspaceco(void *cp, u_int cnt, struct uio *uio, struct vm_object *obj, 206 int disposable) 207{ 208 struct iovec *iov; 209 int error; 210 211 iov = uio->uio_iov; 212 213#ifdef ZERO_COPY_SOCKETS 214 215 if (uio->uio_rw == UIO_READ) { 216 if ((so_zero_copy_receive != 0) 217 && (obj != NULL) 218 && ((cnt & PAGE_MASK) == 0) 219 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) 220 && ((uio->uio_offset & PAGE_MASK) == 0) 221 && ((((intptr_t) cp) & PAGE_MASK) == 0) 222 && (obj->type == OBJT_DEFAULT) 223 && (disposable != 0)) { 224 /* SOCKET: use page-trading */ 225 /* 226 * We only want to call vm_pgmoveco() on 227 * disposeable pages, since it gives the 228 * kernel page to the userland process. 
229 */ 230 error = vm_pgmoveco(&curproc->p_vmspace->vm_map, 231 obj, (vm_offset_t)cp, 232 (vm_offset_t)iov->iov_base); 233 234 /* 235 * If we get an error back, attempt 236 * to use copyout() instead. The 237 * disposable page should be freed 238 * automatically if we weren't able to move 239 * it into userland. 240 */ 241 if (error != 0) 242 error = copyout(cp, iov->iov_base, cnt); 243#ifdef ENABLE_VFS_IOOPT 244 } else if ((vfs_ioopt != 0) 245 && ((cnt & PAGE_MASK) == 0) 246 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) 247 && ((uio->uio_offset & PAGE_MASK) == 0) 248 && ((((intptr_t) cp) & PAGE_MASK) == 0)) { 249 error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, 250 uio->uio_offset, cnt, 251 (vm_offset_t) iov->iov_base, NULL); 252#endif /* ENABLE_VFS_IOOPT */ 253 } else { 254 error = copyout(cp, iov->iov_base, cnt); 255 } 256 } else { 257 error = copyin(iov->iov_base, cp, cnt); 258 } 259#else /* ZERO_COPY_SOCKETS */ 260 if (uio->uio_rw == UIO_READ) { 261#ifdef ENABLE_VFS_IOOPT 262 if ((vfs_ioopt != 0) 263 && ((cnt & PAGE_MASK) == 0) 264 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) 265 && ((uio->uio_offset & PAGE_MASK) == 0) 266 && ((((intptr_t) cp) & PAGE_MASK) == 0)) { 267 error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, 268 uio->uio_offset, cnt, 269 (vm_offset_t) iov->iov_base, NULL); 270 } else 271#endif /* ENABLE_VFS_IOOPT */ 272 { 273 error = copyout(cp, iov->iov_base, cnt); 274 } 275 } else { 276 error = copyin(iov->iov_base, cp, cnt); 277 } 278#endif /* ZERO_COPY_SOCKETS */ 279 280 return (error); 281} 282 283int 284uiomoveco(void *cp, int n, struct uio *uio, struct vm_object *obj, 285 int disposable) 286{ 287 struct iovec *iov; 288 u_int cnt; 289 int error; 290 291 KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, 292 ("uiomoveco: mode")); 293 KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, 294 ("uiomoveco proc")); 295 296 while (n > 0 && uio->uio_resid) { 297 iov = uio->uio_iov; 298 cnt = iov->iov_len; 
299 if (cnt == 0) { 300 uio->uio_iov++; 301 uio->uio_iovcnt--; 302 continue; 303 } 304 if (cnt > n) 305 cnt = n; 306 307 switch (uio->uio_segflg) { 308 309 case UIO_USERSPACE: 310 if (ticks - PCPU_GET(switchticks) >= hogticks) 311 uio_yield(); 312 313 error = userspaceco(cp, cnt, uio, obj, disposable); 314 315 if (error) 316 return (error); 317 break; 318 319 case UIO_SYSSPACE: 320 if (uio->uio_rw == UIO_READ) 321 bcopy(cp, iov->iov_base, cnt); 322 else 323 bcopy(iov->iov_base, cp, cnt); 324 break; 325 case UIO_NOCOPY: 326 break; 327 } 328 iov->iov_base = (char *)iov->iov_base + cnt; 329 iov->iov_len -= cnt; 330 uio->uio_resid -= cnt; 331 uio->uio_offset += cnt; 332 cp = (char *)cp + cnt; 333 n -= cnt; 334 } 335 return (0); 336} 337#endif /* ENABLE_VFS_IOOPT || ZERO_COPY_SOCKETS */ 338 339#ifdef ENABLE_VFS_IOOPT 340 341/* 342 * Experimental support for zero-copy I/O 343 */ 344int 345uioread(int n, struct uio *uio, struct vm_object *obj, int *nread) 346{ 347 int npagesmoved; 348 struct iovec *iov; 349 u_int cnt, tcnt; 350 int error; 351 352 *nread = 0; 353 if (vfs_ioopt < 2) 354 return 0; 355 356 error = 0; 357 358 while (n > 0 && uio->uio_resid) { 359 iov = uio->uio_iov; 360 cnt = iov->iov_len; 361 if (cnt == 0) { 362 uio->uio_iov++; 363 uio->uio_iovcnt--; 364 continue; 365 } 366 if (cnt > n) 367 cnt = n; 368 369 if ((uio->uio_segflg == UIO_USERSPACE) && 370 ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) && 371 ((uio->uio_offset & PAGE_MASK) == 0) ) { 372 373 if (cnt < PAGE_SIZE) 374 break; 375 376 cnt &= ~PAGE_MASK; 377 378 if (ticks - PCPU_GET(switchticks) >= hogticks) 379 uio_yield(); 380 error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, 381 uio->uio_offset, cnt, 382 (vm_offset_t) iov->iov_base, &npagesmoved); 383 384 if (npagesmoved == 0) 385 break; 386 387 tcnt = npagesmoved * PAGE_SIZE; 388 cnt = tcnt; 389 390 if (error) 391 break; 392 393 iov->iov_base = (char *)iov->iov_base + cnt; 394 iov->iov_len -= cnt; 395 uio->uio_resid -= cnt; 396 uio->uio_offset 
+= cnt; 397 *nread += cnt; 398 n -= cnt; 399 } else { 400 break; 401 } 402 } 403 return error; 404} 405#endif /* ENABLE_VFS_IOOPT */ 406 407/* 408 * Give next character to user as result of read. 409 */ 410int 411ureadc(int c, struct uio *uio) 412{ 413 struct iovec *iov; 414 char *iov_base; 415 416again: 417 if (uio->uio_iovcnt == 0 || uio->uio_resid == 0) 418 panic("ureadc"); 419 iov = uio->uio_iov; 420 if (iov->iov_len == 0) { 421 uio->uio_iovcnt--; 422 uio->uio_iov++; 423 goto again; 424 } 425 switch (uio->uio_segflg) { 426 427 case UIO_USERSPACE: 428 if (subyte(iov->iov_base, c) < 0) 429 return (EFAULT); 430 break; 431 432 case UIO_SYSSPACE: 433 iov_base = iov->iov_base; 434 *iov_base = c; 435 iov->iov_base = iov_base; 436 break; 437 438 case UIO_NOCOPY: 439 break; 440 } 441 iov->iov_base = (char *)iov->iov_base + 1; 442 iov->iov_len--; 443 uio->uio_resid--; 444 uio->uio_offset++; 445 return (0); 446} 447 448/* 449 * General routine to allocate a hash table. 450 */ 451void * 452hashinit(int elements, struct malloc_type *type, u_long *hashmask) 453{ 454 long hashsize; 455 LIST_HEAD(generic, generic) *hashtbl; 456 int i; 457 458 if (elements <= 0) 459 panic("hashinit: bad elements"); 460 for (hashsize = 1; hashsize <= elements; hashsize <<= 1) 461 continue; 462 hashsize >>= 1; 463 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); 464 for (i = 0; i < hashsize; i++) 465 LIST_INIT(&hashtbl[i]); 466 *hashmask = hashsize - 1; 467 return (hashtbl); 468} 469 470void 471hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask) 472{ 473 LIST_HEAD(generic, generic) *hashtbl, *hp; 474 475 hashtbl = vhashtbl; 476 for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++) 477 if (!LIST_EMPTY(hp)) 478 panic("hashdestroy: hash not empty"); 479 free(hashtbl, type); 480} 481 482static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039, 483 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653, 484 7159, 7673, 8191, 12281, 16381, 24571, 
32749 }; 485#define NPRIMES (sizeof(primes) / sizeof(primes[0])) 486 487/* 488 * General routine to allocate a prime number sized hash table. 489 */ 490void * 491phashinit(int elements, struct malloc_type *type, u_long *nentries) 492{ 493 long hashsize; 494 LIST_HEAD(generic, generic) *hashtbl; 495 int i; 496 497 if (elements <= 0) 498 panic("phashinit: bad elements"); 499 for (i = 1, hashsize = primes[1]; hashsize <= elements;) { 500 i++; 501 if (i == NPRIMES) 502 break; 503 hashsize = primes[i]; 504 } 505 hashsize = primes[i - 1]; 506 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); 507 for (i = 0; i < hashsize; i++) 508 LIST_INIT(&hashtbl[i]); 509 *nentries = hashsize; 510 return (hashtbl); 511} 512 513void 514uio_yield(void) 515{ 516 struct thread *td; 517 518 td = curthread; 519 mtx_lock_spin(&sched_lock); 520 DROP_GIANT(); 521 sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */ 522 td->td_proc->p_stats->p_ru.ru_nivcsw++; 523 mi_switch(); 524 mtx_unlock_spin(&sched_lock); 525 PICKUP_GIANT(); 526} 527 528int 529copyinfrom(const void *src, void *dst, size_t len, int seg) 530{ 531 int error = 0; 532 533 switch (seg) { 534 case UIO_USERSPACE: 535 error = copyin(src, dst, len); 536 break; 537 case UIO_SYSSPACE: 538 bcopy(src, dst, len); 539 break; 540 default: 541 panic("copyinfrom: bad seg %d\n", seg); 542 } 543 return (error); 544} 545 546int 547copyinstrfrom(const void *src, void *dst, size_t len, size_t *copied, int seg) 548{ 549 int error = 0; 550 551 switch (seg) { 552 case UIO_USERSPACE: 553 error = copyinstr(src, dst, len, copied); 554 break; 555 case UIO_SYSSPACE: 556 error = copystr(src, dst, len, copied); 557 break; 558 default: 559 panic("copyinstrfrom: bad seg %d\n", seg); 560 } 561 return (error); 562} 563