subr_hash.c revision 131473
1/* 2 * Copyright (c) 1982, 1986, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 35 */ 36 37#include <sys/cdefs.h> 38__FBSDID("$FreeBSD: head/sys/kern/kern_subr.c 131473 2004-07-02 19:09:50Z jhb $"); 39 40#include "opt_zero.h" 41 42#include <sys/param.h> 43#include <sys/systm.h> 44#include <sys/kernel.h> 45#include <sys/ktr.h> 46#include <sys/limits.h> 47#include <sys/lock.h> 48#include <sys/mutex.h> 49#include <sys/proc.h> 50#include <sys/malloc.h> 51#include <sys/resourcevar.h> 52#include <sys/sched.h> 53#include <sys/sysctl.h> 54#include <sys/vnode.h> 55 56#include <vm/vm.h> 57#include <vm/vm_page.h> 58#include <vm/vm_map.h> 59#ifdef ZERO_COPY_SOCKETS 60#include <vm/vm_param.h> 61#include <vm/vm_object.h> 62#endif 63 64SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV, 65 "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)"); 66 67#ifdef ZERO_COPY_SOCKETS 68/* Declared in uipc_socket.c */ 69extern int so_zero_copy_receive; 70 71static int 72vm_pgmoveco(vm_map_t mapa, vm_object_t srcobj, vm_offset_t kaddr, 73 vm_offset_t uaddr) 74{ 75 vm_map_t map = mapa; 76 vm_page_t kern_pg, user_pg; 77 vm_object_t uobject; 78 vm_map_entry_t entry; 79 vm_pindex_t upindex, kpindex; 80 vm_prot_t prot; 81 boolean_t wired; 82 83 /* 84 * First lookup the kernel page. 85 */ 86 kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr)); 87 /* 88 * XXX The vm object containing kern_pg needs locking. 89 */ 90 if ((vm_map_lookup(&map, uaddr, 91 VM_PROT_WRITE, &entry, &uobject, 92 &upindex, &prot, &wired)) != KERN_SUCCESS) { 93 return(EFAULT); 94 } 95 VM_OBJECT_LOCK(uobject); 96 if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) { 97 do 98 vm_page_lock_queues(); 99 while (vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco")); 100 vm_page_busy(user_pg); 101 pmap_remove_all(user_pg); 102 vm_page_free(user_pg); 103 } else 104 vm_page_lock_queues(); 105 if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) || 106 (kern_pg->hold_count != 0)|| (kern_pg->flags & PG_BUSY)) { 107 printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), " 108 "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex, 109 kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 1 : 0, 110 kern_pg->hold_count, (u_long)kern_pg->phys_addr); 111 if ((kern_pg->queue - kern_pg->pc) == PQ_FREE) 112 panic("vm_pgmoveco: renaming free page"); 113 else 114 panic("vm_pgmoveco: renaming busy page"); 115 } 116 kpindex = kern_pg->pindex; 117 vm_page_busy(kern_pg); 118 vm_page_rename(kern_pg, uobject, upindex); 119 vm_page_flag_clear(kern_pg, PG_BUSY); 120 kern_pg->valid = VM_PAGE_BITS_ALL; 121 vm_page_unlock_queues(); 122 VM_OBJECT_UNLOCK(uobject); 123 vm_map_lookup_done(map, entry); 124 return(KERN_SUCCESS); 125} 126#endif /* ZERO_COPY_SOCKETS */ 127 128int 129uiomove(void *cp, int n, struct uio *uio) 130{ 131 struct thread *td = curthread; 132 struct iovec *iov; 133 u_int cnt; 134 int error = 0; 135 int save = 0; 136 137 KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, 138 ("uiomove: mode")); 139 KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, 140 ("uiomove proc")); 141 142 save = td->td_pflags & TDP_DEADLKTREAT; 143 td->td_pflags |= TDP_DEADLKTREAT; 144 145 while (n > 0 && uio->uio_resid) { 146 iov = uio->uio_iov; 147 cnt = iov->iov_len; 148 if (cnt == 0) { 149 uio->uio_iov++; 150 uio->uio_iovcnt--; 151 continue; 152 } 153 if (cnt > n) 154 cnt = n; 155 156 switch (uio->uio_segflg) { 157 158 case UIO_USERSPACE: 159 if (ticks - PCPU_GET(switchticks) >= hogticks) 160 uio_yield(); 161 if (uio->uio_rw == UIO_READ) 162 error = copyout(cp, iov->iov_base, cnt); 163 else 164 error = copyin(iov->iov_base, cp, cnt); 165 if (error) 166 goto out; 167 break; 168 169 case UIO_SYSSPACE: 170 if (uio->uio_rw == UIO_READ) 171 bcopy(cp, iov->iov_base, cnt); 172 else 173 bcopy(iov->iov_base, cp, cnt); 174 break; 175 case UIO_NOCOPY: 176 break; 177 } 178 iov->iov_base = (char *)iov->iov_base + cnt; 179 iov->iov_len -= cnt; 180 uio->uio_resid -= cnt; 181 uio->uio_offset += cnt; 182 cp = (char *)cp + cnt; 183 n -= cnt; 184 } 185out: 186 if (save == 0) 187 td->td_pflags &= ~TDP_DEADLKTREAT; 188 return (error); 189} 190 191/* 192 * Wrapper for uiomove() that validates the arguments against a known-good 193 * kernel buffer. Currently, uiomove accepts a signed (n) argument, which 194 * is almost definitely a bad thing, so we catch that here as well. We 195 * return a runtime failure, but it might be desirable to generate a runtime 196 * assertion failure instead. 197 */ 198int 199uiomove_frombuf(void *buf, int buflen, struct uio *uio) 200{ 201 unsigned int offset, n; 202 203 if (uio->uio_offset < 0 || uio->uio_resid < 0 || 204 (offset = uio->uio_offset) != uio->uio_offset) 205 return (EINVAL); 206 if (buflen <= 0 || offset >= buflen) 207 return (0); 208 if ((n = buflen - offset) > INT_MAX) 209 return (EINVAL); 210 return (uiomove((char *)buf + offset, n, uio)); 211} 212 213#ifdef ZERO_COPY_SOCKETS 214/* 215 * Experimental support for zero-copy I/O 216 */ 217static int 218userspaceco(void *cp, u_int cnt, struct uio *uio, struct vm_object *obj, 219 int disposable) 220{ 221 struct iovec *iov; 222 int error; 223 224 iov = uio->uio_iov; 225 if (uio->uio_rw == UIO_READ) { 226 if ((so_zero_copy_receive != 0) 227 && (obj != NULL) 228 && ((cnt & PAGE_MASK) == 0) 229 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) 230 && ((uio->uio_offset & PAGE_MASK) == 0) 231 && ((((intptr_t) cp) & PAGE_MASK) == 0) 232 && (obj->type == OBJT_DEFAULT) 233 && (disposable != 0)) { 234 /* SOCKET: use page-trading */ 235 /* 236 * We only want to call vm_pgmoveco() on 237 * disposeable pages, since it gives the 238 * kernel page to the userland process. 239 */ 240 error = vm_pgmoveco(&curproc->p_vmspace->vm_map, 241 obj, (vm_offset_t)cp, 242 (vm_offset_t)iov->iov_base); 243 244 /* 245 * If we get an error back, attempt 246 * to use copyout() instead. The 247 * disposable page should be freed 248 * automatically if we weren't able to move 249 * it into userland. 250 */ 251 if (error != 0) 252 error = copyout(cp, iov->iov_base, cnt); 253 } else { 254 error = copyout(cp, iov->iov_base, cnt); 255 } 256 } else { 257 error = copyin(iov->iov_base, cp, cnt); 258 } 259 return (error); 260} 261 262int 263uiomoveco(void *cp, int n, struct uio *uio, struct vm_object *obj, 264 int disposable) 265{ 266 struct iovec *iov; 267 u_int cnt; 268 int error; 269 270 KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, 271 ("uiomoveco: mode")); 272 KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, 273 ("uiomoveco proc")); 274 275 while (n > 0 && uio->uio_resid) { 276 iov = uio->uio_iov; 277 cnt = iov->iov_len; 278 if (cnt == 0) { 279 uio->uio_iov++; 280 uio->uio_iovcnt--; 281 continue; 282 } 283 if (cnt > n) 284 cnt = n; 285 286 switch (uio->uio_segflg) { 287 288 case UIO_USERSPACE: 289 if (ticks - PCPU_GET(switchticks) >= hogticks) 290 uio_yield(); 291 292 error = userspaceco(cp, cnt, uio, obj, disposable); 293 294 if (error) 295 return (error); 296 break; 297 298 case UIO_SYSSPACE: 299 if (uio->uio_rw == UIO_READ) 300 bcopy(cp, iov->iov_base, cnt); 301 else 302 bcopy(iov->iov_base, cp, cnt); 303 break; 304 case UIO_NOCOPY: 305 break; 306 } 307 iov->iov_base = (char *)iov->iov_base + cnt; 308 iov->iov_len -= cnt; 309 uio->uio_resid -= cnt; 310 uio->uio_offset += cnt; 311 cp = (char *)cp + cnt; 312 n -= cnt; 313 } 314 return (0); 315} 316#endif /* ZERO_COPY_SOCKETS */ 317 318/* 319 * Give next character to user as result of read. 320 */ 321int 322ureadc(int c, struct uio *uio) 323{ 324 struct iovec *iov; 325 char *iov_base; 326 327again: 328 if (uio->uio_iovcnt == 0 || uio->uio_resid == 0) 329 panic("ureadc"); 330 iov = uio->uio_iov; 331 if (iov->iov_len == 0) { 332 uio->uio_iovcnt--; 333 uio->uio_iov++; 334 goto again; 335 } 336 switch (uio->uio_segflg) { 337 338 case UIO_USERSPACE: 339 if (subyte(iov->iov_base, c) < 0) 340 return (EFAULT); 341 break; 342 343 case UIO_SYSSPACE: 344 iov_base = iov->iov_base; 345 *iov_base = c; 346 iov->iov_base = iov_base; 347 break; 348 349 case UIO_NOCOPY: 350 break; 351 } 352 iov->iov_base = (char *)iov->iov_base + 1; 353 iov->iov_len--; 354 uio->uio_resid--; 355 uio->uio_offset++; 356 return (0); 357} 358 359/* 360 * General routine to allocate a hash table. 361 */ 362void * 363hashinit(int elements, struct malloc_type *type, u_long *hashmask) 364{ 365 long hashsize; 366 LIST_HEAD(generic, generic) *hashtbl; 367 int i; 368 369 if (elements <= 0) 370 panic("hashinit: bad elements"); 371 for (hashsize = 1; hashsize <= elements; hashsize <<= 1) 372 continue; 373 hashsize >>= 1; 374 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); 375 for (i = 0; i < hashsize; i++) 376 LIST_INIT(&hashtbl[i]); 377 *hashmask = hashsize - 1; 378 return (hashtbl); 379} 380 381void 382hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask) 383{ 384 LIST_HEAD(generic, generic) *hashtbl, *hp; 385 386 hashtbl = vhashtbl; 387 for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++) 388 if (!LIST_EMPTY(hp)) 389 panic("hashdestroy: hash not empty"); 390 free(hashtbl, type); 391} 392 393static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039, 394 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653, 395 7159, 7673, 8191, 12281, 16381, 24571, 32749 }; 396#define NPRIMES (sizeof(primes) / sizeof(primes[0])) 397 398/* 399 * General routine to allocate a prime number sized hash table. 400 */ 401void * 402phashinit(int elements, struct malloc_type *type, u_long *nentries) 403{ 404 long hashsize; 405 LIST_HEAD(generic, generic) *hashtbl; 406 int i; 407 408 if (elements <= 0) 409 panic("phashinit: bad elements"); 410 for (i = 1, hashsize = primes[1]; hashsize <= elements;) { 411 i++; 412 if (i == NPRIMES) 413 break; 414 hashsize = primes[i]; 415 } 416 hashsize = primes[i - 1]; 417 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); 418 for (i = 0; i < hashsize; i++) 419 LIST_INIT(&hashtbl[i]); 420 *nentries = hashsize; 421 return (hashtbl); 422} 423 424void 425uio_yield(void) 426{ 427 struct thread *td; 428 429 td = curthread; 430 mtx_lock_spin(&sched_lock); 431 DROP_GIANT(); 432 sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */ 433 mi_switch(SW_INVOL, NULL); 434 mtx_unlock_spin(&sched_lock); 435 PICKUP_GIANT(); 436} 437 438int 439copyinfrom(const void * __restrict src, void * __restrict dst, size_t len, 440 int seg) 441{ 442 int error = 0; 443 444 switch (seg) { 445 case UIO_USERSPACE: 446 error = copyin(src, dst, len); 447 break; 448 case UIO_SYSSPACE: 449 bcopy(src, dst, len); 450 break; 451 default: 452 panic("copyinfrom: bad seg %d\n", seg); 453 } 454 return (error); 455} 456 457int 458copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len, 459 size_t * __restrict copied, int seg) 460{ 461 int error = 0; 462 463 switch (seg) { 464 case UIO_USERSPACE: 465 error = copyinstr(src, dst, len, copied); 466 break; 467 case UIO_SYSSPACE: 468 error = copystr(src, dst, len, copied); 469 break; 470 default: 471 panic("copyinstrfrom: bad seg %d\n", seg); 472 } 473 return (error); 474} 475 476int 477uiofromiov(struct iovec *iovp, u_int iovcnt, struct uio *uio) 478{ 479 struct iovec *iov; 480 u_int iovlen; 481 int error, i; 482 483 /* note: can't use iovlen until iovcnt is validated */ 484 iovlen = iovcnt * sizeof (struct iovec); 485 if (iovcnt > UIO_MAXIOV) { 486 error = EINVAL; 487 goto done; 488 } 489 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); 490 uio->uio_iov = iov; 491 uio->uio_iovcnt = iovcnt; 492 uio->uio_segflg = UIO_USERSPACE; 493 uio->uio_offset = -1; 494 if ((error = copyin(iovp, iov, iovlen))) 495 goto done; 496 uio->uio_resid = 0; 497 for (i = 0; i < iovcnt; i++) { 498 if (iov->iov_len > INT_MAX - uio->uio_resid) { 499 error = EINVAL; 500 goto done; 501 } 502 uio->uio_resid += iov->iov_len; 503 iov++; 504 } 505 506done: 507 if (error && uio->uio_iov) { 508 FREE(uio->uio_iov, M_IOV); 509 uio->uio_iov = NULL; 510 } 511 return (error); 512 513} 514