kern_subr.c revision 125420
1/* 2 * Copyright (c) 1982, 1986, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_subr.c 125420 2004-02-04 08:14:47Z silby $");

#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#ifdef ZERO_COPY_SOCKETS
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#endif

SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

#ifdef ZERO_COPY_SOCKETS
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;

/*
 * "Page-trade" the kernel page backing 'kaddr' into the user address space
 * at 'uaddr': the user map entry is looked up for write, any page already
 * resident at that address is removed and freed, and the kernel page is
 * renamed into the user object in its place, avoiding a data copy.
 *
 * Returns EFAULT if the user address cannot be looked up for write;
 * KERN_SUCCESS (0) on success.  Panics if the kernel page is busy, held,
 * or on the free queue, since renaming such a page would corrupt VM state.
 *
 * NOTE(review): the 'srcobj' parameter is accepted but never referenced in
 * this function body — confirm whether it was intended for the object
 * locking flagged in the XXX below.
 */
static int
vm_pgmoveco(vm_map_t mapa, vm_object_t srcobj, vm_offset_t kaddr,
    vm_offset_t uaddr)
{
	vm_map_t map = mapa;
	vm_page_t kern_pg, user_pg;
	vm_object_t uobject;
	vm_map_entry_t entry;
	vm_pindex_t upindex, kpindex;
	vm_prot_t prot;
	boolean_t wired;

	/*
	 * First lookup the kernel page.
	 */
	kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
	/*
	 * XXX The vm object containing kern_pg needs locking.
	 */
	if ((vm_map_lookup(&map, uaddr,
			   VM_PROT_WRITE, &entry, &uobject,
			   &upindex, &prot, &wired)) != KERN_SUCCESS) {
		return(EFAULT);
	}
	VM_OBJECT_LOCK(uobject);
	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
		/*
		 * A page is already resident at the user address: wait for
		 * it to become un-busy, then unmap and free it so the
		 * kernel page can take its place.
		 */
		do
			vm_page_lock_queues();
		while (vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco"));
		vm_page_busy(user_pg);
		pmap_remove_all(user_pg);
		vm_page_free(user_pg);
	} else
		vm_page_lock_queues();
	/* Refuse to rename a page that is busy, free, or held. */
	if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) ||
	    (kern_pg->hold_count != 0)|| (kern_pg->flags & PG_BUSY)) {
		printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), "
		    "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex,
		    kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 1 : 0,
		    kern_pg->hold_count, (u_long)kern_pg->phys_addr);
		if ((kern_pg->queue - kern_pg->pc) == PQ_FREE)
			panic("vm_pgmoveco: renaming free page");
		else
			panic("vm_pgmoveco: renaming busy page");
	}
	kpindex = kern_pg->pindex;
	vm_page_busy(kern_pg);
	vm_page_rename(kern_pg, uobject, upindex);
	vm_page_flag_clear(kern_pg, PG_BUSY);
	kern_pg->valid = VM_PAGE_BITS_ALL;
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(uobject);
	vm_map_lookup_done(map, entry);
	return(KERN_SUCCESS);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Transfer up to 'n' bytes between the kernel buffer 'cp' and the
 * scatter/gather vectors described by 'uio', in the direction given by
 * uio->uio_rw (UIO_READ: cp -> uio, UIO_WRITE: uio -> cp; UIO_NOCOPY
 * advances the bookkeeping without moving data).  The iovec pointers,
 * uio_resid, and uio_offset are advanced as the copy proceeds.
 *
 * While copying, the current thread is marked TDF_DEADLKTREAT (restored on
 * exit unless the flag was already set — presumably special deadlock
 * treatment during the copy; confirm against the scheduler).  User-space
 * copies voluntarily yield the CPU if the thread has run for at least
 * 'hogticks' without switching.
 *
 * Returns 0 on success or an errno from copyin()/copyout().
 */
int
uiomove(void *cp, int n, struct uio *uio)
{
	struct thread *td = curthread;
	struct iovec *iov;
	u_int cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove proc"));

	if (td) {
		mtx_lock_spin(&sched_lock);
		/* Remember whether the flag was already set so we only
		 * clear it if we were the ones to set it. */
		save = td->td_flags & TDF_DEADLKTREAT;
		td->td_flags |= TDF_DEADLKTREAT;
		mtx_unlock_spin(&sched_lock);
	}

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* Skip exhausted iovec entries. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				goto out;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
out:
	if (td && save == 0) {
		mtx_lock_spin(&sched_lock);
		td->td_flags &= ~TDF_DEADLKTREAT;
		mtx_unlock_spin(&sched_lock);
	}
	return (error);
}

/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  Currently, uiomove accepts a signed (n) argument, which
 * is almost definitely a bad thing, so we catch that here as well.  We
 * return a runtime failure, but it might be desirable to generate a runtime
 * assertion failure instead.
 */
int
uiomove_frombuf(void *buf, int buflen, struct uio *uio)
{
	unsigned int offset, n;

	/* Reject negative offsets/resid and offsets that do not survive
	 * the narrowing to unsigned int. */
	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
	    (offset = uio->uio_offset) != uio->uio_offset)
		return (EINVAL);
	/* Nothing to transfer at or beyond the end of the buffer. */
	if (buflen <= 0 || offset >= buflen)
		return (0);
	if ((n = buflen - offset) > INT_MAX)
		return (EINVAL);
	return (uiomove((char *)buf + offset, n, uio));
}

#ifdef ZERO_COPY_SOCKETS
/*
 * Experimental support for zero-copy I/O
 */
/*
 * Move 'cnt' bytes between the kernel buffer 'cp' and the current user
 * iovec of 'uio'.  For reads, if zero-copy receive is enabled and the
 * buffer, user address, offset, and count are all page-aligned, the page
 * is disposable, and the backing object is OBJT_DEFAULT, attempt a page
 * trade via vm_pgmoveco(); otherwise (or if the trade fails) fall back to
 * an ordinary copyout().  Writes always use copyin().
 *
 * Returns 0 or an errno from the copy/page-move primitives.
 */
static int
userspaceco(void *cp, u_int cnt, struct uio *uio, struct vm_object *obj,
    int disposable)
{
	struct iovec *iov;
	int error;

	iov = uio->uio_iov;
	if (uio->uio_rw == UIO_READ) {
		if ((so_zero_copy_receive != 0)
		 && (obj != NULL)
		 && ((cnt & PAGE_MASK) == 0)
		 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		 && ((uio->uio_offset & PAGE_MASK) == 0)
		 && ((((intptr_t) cp) & PAGE_MASK) == 0)
		 && (obj->type == OBJT_DEFAULT)
		 && (disposable != 0)) {
			/* SOCKET: use page-trading */
			/*
			 * We only want to call vm_pgmoveco() on
			 * disposeable pages, since it gives the
			 * kernel page to the userland process.
			 */
			error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
			    obj, (vm_offset_t)cp,
			    (vm_offset_t)iov->iov_base);

			/*
			 * If we get an error back, attempt
			 * to use copyout() instead.  The
			 * disposable page should be freed
			 * automatically if we weren't able to move
			 * it into userland.
			 */
			if (error != 0)
				error = copyout(cp, iov->iov_base, cnt);
		} else {
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
	return (error);
}

/*
 * Zero-copy-aware variant of uiomove(): identical bookkeeping, but
 * user-space segments go through userspaceco() so that eligible reads can
 * page-trade instead of copying.  Unlike uiomove(), this does not set
 * TDF_DEADLKTREAT on the thread.
 *
 * Returns 0 on success or an errno from userspaceco().
 */
int
uiomoveco(void *cp, int n, struct uio *uio, struct vm_object *obj,
    int disposable)
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* Skip exhausted iovec entries. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();

			error = userspaceco(cp, cnt, uio, obj, disposable);

			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
	return (0);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Give next character to user as result of read.
 */
int
ureadc(int c, struct uio *uio)
{
	struct iovec *iov;
	char *iov_base;

again:
	/* Caller must not hand us an exhausted uio. */
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		/* Skip exhausted iovec entries. */
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		/* iov_base is void *; stage through a char * to store. */
		iov_base = iov->iov_base;
		*iov_base = c;
		iov->iov_base = iov_base;
		break;

	case UIO_NOCOPY:
		break;
	}
	iov->iov_base = (char *)iov->iov_base + 1;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}

/*
 * General routine to allocate a hash table.
 *
 * Allocates (with M_WAITOK, so this cannot fail) a table of empty list
 * heads whose size is the largest power of two not exceeding 'elements',
 * and stores size - 1 in *hashmask for use as an AND-style hash mask.
 * Panics if 'elements' is not positive.
 */
void *
hashinit(int elements, struct malloc_type *type, u_long *hashmask)
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("hashinit: bad elements");
	/* Find the smallest power of two > elements, then halve it. */
	for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
		continue;
	hashsize >>= 1;
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*hashmask = hashsize - 1;
	return (hashtbl);
}

/*
 * Free a hash table previously allocated with hashinit().  Every chain
 * must already be empty; a non-empty chain indicates leaked entries and
 * causes a panic.
 */
void
hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask)
{
	LIST_HEAD(generic, generic) *hashtbl, *hp;

	hashtbl = vhashtbl;
	for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++)
		if (!LIST_EMPTY(hp))
			panic("hashdestroy: hash not empty");
	free(hashtbl, type);
}

/*
 * Prime table for phashinit().  The scan below starts at index 1; the
 * leading 1 serves as the fallback size when 'elements' is smaller than
 * the first real prime (13).
 */
static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
			2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
			7159, 7673, 8191, 12281, 16381, 24571, 32749 };
#define NPRIMES (sizeof(primes) / sizeof(primes[0]))

/*
 * General routine to allocate a prime number sized hash table.
411 */ 412void * 413phashinit(int elements, struct malloc_type *type, u_long *nentries) 414{ 415 long hashsize; 416 LIST_HEAD(generic, generic) *hashtbl; 417 int i; 418 419 if (elements <= 0) 420 panic("phashinit: bad elements"); 421 for (i = 1, hashsize = primes[1]; hashsize <= elements;) { 422 i++; 423 if (i == NPRIMES) 424 break; 425 hashsize = primes[i]; 426 } 427 hashsize = primes[i - 1]; 428 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); 429 for (i = 0; i < hashsize; i++) 430 LIST_INIT(&hashtbl[i]); 431 *nentries = hashsize; 432 return (hashtbl); 433} 434 435void 436uio_yield(void) 437{ 438 struct thread *td; 439 440 td = curthread; 441 mtx_lock_spin(&sched_lock); 442 DROP_GIANT(); 443 sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */ 444 mi_switch(SW_INVOL); 445 mtx_unlock_spin(&sched_lock); 446 PICKUP_GIANT(); 447} 448 449int 450copyinfrom(const void * __restrict src, void * __restrict dst, size_t len, 451 int seg) 452{ 453 int error = 0; 454 455 switch (seg) { 456 case UIO_USERSPACE: 457 error = copyin(src, dst, len); 458 break; 459 case UIO_SYSSPACE: 460 bcopy(src, dst, len); 461 break; 462 default: 463 panic("copyinfrom: bad seg %d\n", seg); 464 } 465 return (error); 466} 467 468int 469copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len, 470 size_t * __restrict copied, int seg) 471{ 472 int error = 0; 473 474 switch (seg) { 475 case UIO_USERSPACE: 476 error = copyinstr(src, dst, len, copied); 477 break; 478 case UIO_SYSSPACE: 479 error = copystr(src, dst, len, copied); 480 break; 481 default: 482 panic("copyinstrfrom: bad seg %d\n", seg); 483 } 484 return (error); 485} 486 487int 488iov_to_uio(struct iovec *iovp, u_int iovcnt, struct uio *uio) 489{ 490 struct iovec *iov; 491 u_int iovlen; 492 int error, i; 493 494 /* note: can't use iovlen until iovcnt is validated */ 495 iovlen = iovcnt * sizeof (struct iovec); 496 if (iovcnt > UIO_MAXIOV) { 497 error = EINVAL; 498 goto done; 499 } 500 MALLOC(iov, 
struct iovec *, iovlen, M_IOV, M_WAITOK); 501 uio->uio_iov = iov; 502 uio->uio_iovcnt = iovcnt; 503 uio->uio_segflg = UIO_USERSPACE; 504 uio->uio_offset = -1; 505 if ((error = copyin(iovp, iov, iovlen))) 506 goto done; 507 uio->uio_resid = 0; 508 for (i = 0; i < iovcnt; i++) { 509 if (iov->iov_len > INT_MAX - uio->uio_resid) { 510 error = EINVAL; 511 goto done; 512 } 513 uio->uio_resid += iov->iov_len; 514 iov++; 515 } 516 517done: 518 if (error && uio->uio_iov) { 519 FREE(uio->uio_iov, M_IOV); 520 uio->uio_iov = NULL; 521 } 522 return (error); 523 524} 525