subr_hash.c revision 116182
/*
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_subr.c 116182 2003-06-11 00:56:59Z obrien $");

#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#ifdef ZERO_COPY_SOCKETS
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#endif

SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");
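/*
 * A minimal sketch of how the limit exported above is typically
 * enforced: system calls that take an iovec array, e.g.
 * readv(2)/writev(2), reject requests with more elements than
 * UIO_MAXIOV.  example_check_iovcnt() is a hypothetical helper,
 * not part of this file.
 */
static __inline int
example_check_iovcnt(u_int iovcnt)
{

	return (iovcnt > UIO_MAXIOV ? EINVAL : 0);
}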
#ifdef ZERO_COPY_SOCKETS
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;

static int
vm_pgmoveco(vm_map_t mapa, vm_object_t srcobj, vm_offset_t kaddr,
    vm_offset_t uaddr)
{
	vm_map_t map = mapa;
	vm_page_t kern_pg, user_pg;
	vm_object_t uobject;
	vm_map_entry_t entry;
	vm_pindex_t upindex, kpindex;
	vm_prot_t prot;
	boolean_t wired;

	/*
	 * First lookup the kernel page.
	 */
	kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
	/*
	 * XXX The vm object containing kern_pg needs locking.
	 */
	if ((vm_map_lookup(&map, uaddr,
	    VM_PROT_WRITE, &entry, &uobject,
	    &upindex, &prot, &wired)) != KERN_SUCCESS) {
		return(EFAULT);
	}
	VM_OBJECT_LOCK(uobject);
	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
		do
			vm_page_lock_queues();
		while (vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco"));
		vm_page_busy(user_pg);
		pmap_remove_all(user_pg);
		vm_page_free(user_pg);
	} else
		vm_page_lock_queues();
	if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) ||
	    (kern_pg->hold_count != 0) || (kern_pg->flags & PG_BUSY)) {
		printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), "
		    "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex,
		    kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 1 : 0,
		    kern_pg->hold_count, (u_long)kern_pg->phys_addr);
		if ((kern_pg->queue - kern_pg->pc) == PQ_FREE)
			panic("vm_pgmoveco: renaming free page");
		else
			panic("vm_pgmoveco: renaming busy page");
	}
	kpindex = kern_pg->pindex;
	vm_page_busy(kern_pg);
	vm_page_rename(kern_pg, uobject, upindex);
	vm_page_flag_clear(kern_pg, PG_BUSY);
	kern_pg->valid = VM_PAGE_BITS_ALL;
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(uobject);
	vm_map_lookup_done(map, entry);
	return(KERN_SUCCESS);
}
#endif /* ZERO_COPY_SOCKETS */

int
uiomove(void *cp, int n, struct uio *uio)
{
	struct thread *td = curthread;
	struct iovec *iov;
	u_int cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove proc"));

	if (td) {
		mtx_lock_spin(&sched_lock);
		save = td->td_flags & TDF_DEADLKTREAT;
		td->td_flags |= TDF_DEADLKTREAT;
		mtx_unlock_spin(&sched_lock);
	}

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				goto out;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
out:
	if (td && save == 0) {
		mtx_lock_spin(&sched_lock);
		td->td_flags &= ~TDF_DEADLKTREAT;
		mtx_unlock_spin(&sched_lock);
	}
	return (error);
}
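/*
 * A minimal sketch of the usual uiomove() calling pattern, assuming
 * the 5.x d_read_t signature; example_read() and its fixed staging
 * buffer are hypothetical and not part of this file.  The driver
 * copies out in bounded chunks until the request is satisfied or an
 * error occurs; uiomove() itself advances uio_offset and uio_resid.
 */
static int
example_read(dev_t dev, struct uio *uio, int ioflag)
{
	char buf[128];
	int amt, error;

	error = 0;
	while (uio->uio_resid > 0 && error == 0) {
		amt = MIN(uio->uio_resid, (int)sizeof(buf));
		/* ... fill buf[0..amt) from device state here ... */
		error = uiomove(buf, amt, uio);
	}
	return (error);
}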
#ifdef ZERO_COPY_SOCKETS
/*
 * Experimental support for zero-copy I/O
 */
static int
userspaceco(void *cp, u_int cnt, struct uio *uio, struct vm_object *obj,
    int disposable)
{
	struct iovec *iov;
	int error;

	iov = uio->uio_iov;
	if (uio->uio_rw == UIO_READ) {
		if ((so_zero_copy_receive != 0)
		 && (obj != NULL)
		 && ((cnt & PAGE_MASK) == 0)
		 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		 && ((uio->uio_offset & PAGE_MASK) == 0)
		 && ((((intptr_t) cp) & PAGE_MASK) == 0)
		 && (obj->type == OBJT_DEFAULT)
		 && (disposable != 0)) {
			/* SOCKET: use page-trading */
			/*
			 * We only want to call vm_pgmoveco() on
			 * disposable pages, since it gives the
			 * kernel page to the userland process.
			 */
			error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
			    obj, (vm_offset_t)cp, (vm_offset_t)iov->iov_base);

			/*
			 * If we get an error back, attempt
			 * to use copyout() instead.  The
			 * disposable page should be freed
			 * automatically if we weren't able to move
			 * it into userland.
			 */
			if (error != 0)
				error = copyout(cp, iov->iov_base, cnt);
		} else {
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
	return (error);
}

int
uiomoveco(void *cp, int n, struct uio *uio, struct vm_object *obj,
    int disposable)
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();

			error = userspaceco(cp, cnt, uio, obj, disposable);

			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
	return (0);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Give next character to user as result of read.
 */
int
ureadc(int c, struct uio *uio)
{
	struct iovec *iov;
	char *iov_base;

again:
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		iov_base = iov->iov_base;
		*iov_base = c;
		iov->iov_base = iov_base;
		break;

	case UIO_NOCOPY:
		break;
	}
	iov->iov_base = (char *)iov->iov_base + 1;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}
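/*
 * A minimal sketch of ureadc() in use, e.g. for a read routine that
 * produces its output one byte at a time; example_puts() is a
 * hypothetical helper, not part of this file.  Each successful call
 * consumes one byte of the caller's request.
 */
static int
example_puts(const char *s, struct uio *uio)
{
	int error;

	error = 0;
	while (*s != '\0' && uio->uio_resid > 0 &&
	    (error = ureadc(*s++, uio)) == 0)
		continue;
	return (error);
}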
/*
 * General routine to allocate a hash table.
 */
void *
hashinit(int elements, struct malloc_type *type, u_long *hashmask)
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("hashinit: bad elements");
	for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
		continue;
	hashsize >>= 1;
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*hashmask = hashsize - 1;
	return (hashtbl);
}

void
hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask)
{
	LIST_HEAD(generic, generic) *hashtbl, *hp;

	hashtbl = vhashtbl;
	for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++)
		if (!LIST_EMPTY(hp))
			panic("hashdestroy: hash not empty");
	free(hashtbl, type);
}

static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
			2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
			7159, 7673, 8191, 12281, 16381, 24571, 32749 };
#define	NPRIMES (sizeof(primes) / sizeof(primes[0]))

/*
 * General routine to allocate a prime number sized hash table.
 */
void *
phashinit(int elements, struct malloc_type *type, u_long *nentries)
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("phashinit: bad elements");
	for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
		i++;
		if (i == NPRIMES)
			break;
		hashsize = primes[i];
	}
	hashsize = primes[i - 1];
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*nentries = hashsize;
	return (hashtbl);
}

void
uio_yield(void)
{
	struct thread *td;

	td = curthread;
	mtx_lock_spin(&sched_lock);
	DROP_GIANT();
	sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */
	td->td_proc->p_stats->p_ru.ru_nivcsw++;
	mi_switch();
	mtx_unlock_spin(&sched_lock);
	PICKUP_GIANT();
}

int
copyinfrom(const void *src, void *dst, size_t len, int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyin(src, dst, len);
		break;
	case UIO_SYSSPACE:
		bcopy(src, dst, len);
		break;
	default:
		panic("copyinfrom: bad seg %d\n", seg);
	}
	return (error);
}

int
copyinstrfrom(const void *src, void *dst, size_t len, size_t *copied, int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyinstr(src, dst, len, copied);
		break;
	case UIO_SYSSPACE:
		error = copystr(src, dst, len, copied);
		break;
	default:
		panic("copyinstrfrom: bad seg %d\n", seg);
	}
	return (error);
}
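/*
 * A minimal sketch of the hashinit()/hashdestroy() contract: the
 * returned table is a power-of-two array of LIST_HEAD buckets and
 * *hashmask is set so that (hash & mask) selects a bucket.  The
 * example_* names, the key field, and M_TEMP as the malloc type are
 * illustrative assumptions, not part of this file.
 */
struct example_ent {
	LIST_ENTRY(example_ent) link;
	u_int key;
};
static LIST_HEAD(example_head, example_ent) *example_tbl;
static u_long example_mask;

static void
example_init(int elements)
{

	example_tbl = hashinit(elements, M_TEMP, &example_mask);
}

static struct example_head *
example_bucket(u_int key)
{

	return (&example_tbl[key & example_mask]);
}

static void
example_fini(void)
{

	/* Every bucket must already be empty, or hashdestroy() panics. */
	hashdestroy(example_tbl, M_TEMP, example_mask);
}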