kern_subr.c revision 109623
/*
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 * $FreeBSD: head/sys/kern/kern_subr.c 109623 2003-01-21 08:56:16Z alfred $
 */

#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#ifdef ZERO_COPY_SOCKETS
#include <vm/vm_param.h>
#endif
#if defined(ZERO_COPY_SOCKETS) || defined(ENABLE_VFS_IOOPT)
#include <vm/vm_object.h>
#endif

SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

#if defined(ZERO_COPY_SOCKETS) || defined(ENABLE_VFS_IOOPT)
static int userspaceco(caddr_t cp, u_int cnt, struct uio *uio,
		       struct vm_object *obj, int disposable);
#endif

#ifdef ZERO_COPY_SOCKETS
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;

static int vm_pgmoveco(vm_map_t mapa, vm_object_t srcobj, vm_offset_t kaddr,
		       vm_offset_t uaddr);

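/*
 * Hand a kernel page directly to a user address by renaming the page
 * into the VM object backing the user mapping; any page already
 * present at the user address is freed first.  This is the
 * page-trading step behind zero-copy socket receive.
 */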
static int
vm_pgmoveco(mapa, srcobj, kaddr, uaddr)
	vm_map_t mapa;
	vm_object_t srcobj;
	vm_offset_t kaddr, uaddr;
{
	vm_map_t map = mapa;
	vm_page_t kern_pg, user_pg;
	vm_object_t uobject;
	vm_map_entry_t entry;
	vm_pindex_t upindex, kpindex;
	vm_prot_t prot;
	boolean_t wired;

	/*
	 * First lookup the kernel page.
	 */
	kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));

	if ((vm_map_lookup(&map, uaddr,
			   VM_PROT_READ, &entry, &uobject,
			   &upindex, &prot, &wired)) != KERN_SUCCESS) {
		return (EFAULT);
	}
	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
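		/*
		 * vm_page_sleep_if_busy() drops the page queues lock
		 * when it sleeps, so the lock must be retaken on each
		 * iteration before the busy check is retried.
		 */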
		do
			vm_page_lock_queues();
		while (vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco"));
		vm_page_busy(user_pg);
		pmap_remove_all(user_pg);
		vm_page_free(user_pg);
	} else
		vm_page_lock_queues();
	if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) ||
	    (kern_pg->hold_count != 0) || (kern_pg->flags & PG_BUSY)) {
		printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), "
		       "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex,
		       kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 1 : 0,
		       kern_pg->hold_count, (u_long)kern_pg->phys_addr);
		if ((kern_pg->queue - kern_pg->pc) == PQ_FREE)
			panic("vm_pgmoveco: renaming free page");
		else
			panic("vm_pgmoveco: renaming busy page");
	}
	kpindex = kern_pg->pindex;
	vm_page_busy(kern_pg);
	vm_page_rename(kern_pg, uobject, upindex);
	vm_page_flag_clear(kern_pg, PG_BUSY);
	kern_pg->valid = VM_PAGE_BITS_ALL;
	vm_page_unlock_queues();

	vm_map_lookup_done(map, entry);
	return (KERN_SUCCESS);
}
#endif /* ZERO_COPY_SOCKETS */

int
uiomove(cp, n, uio)
	register caddr_t cp;
	register int n;
	register struct uio *uio;
{
	struct thread *td = curthread;
	register struct iovec *iov;
	u_int cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove proc"));

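	/*
	 * Flag the thread for deadlock-avoidance treatment while it
	 * may page-fault on user addresses during the copy; the
	 * previous setting is saved and restored on the way out.
	 */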
	if (td) {
		mtx_lock_spin(&sched_lock);
		save = td->td_flags & TDF_DEADLKTREAT;
		td->td_flags |= TDF_DEADLKTREAT;
		mtx_unlock_spin(&sched_lock);
	}

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				goto out;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp += cnt;
		n -= cnt;
	}
out:
	if (td != curthread)
		printf("uiomove: IT CHANGED!");
	td = curthread;	/* Might things have changed in copyin/copyout? */
	if (td) {
		mtx_lock_spin(&sched_lock);
		td->td_flags = (td->td_flags & ~TDF_DEADLKTREAT) | save;
		mtx_unlock_spin(&sched_lock);
	}
	return (error);
}
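
/*
 * Example (hypothetical character-device read routine): drivers
 * normally feed data to the caller's buffers with uiomove(), which
 * advances uio_offset and uio_resid as a side effect:
 *
 *	static int
 *	foo_read(dev_t dev, struct uio *uio, int ioflag)
 *	{
 *		int len = imin(uio->uio_resid, sizeof(foo_buf));
 *
 *		return (uiomove(foo_buf, len, uio));
 *	}
 */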

#if defined(ENABLE_VFS_IOOPT) || defined(ZERO_COPY_SOCKETS)
/*
 * Experimental support for zero-copy I/O
 */
static int
userspaceco(cp, cnt, uio, obj, disposable)
	caddr_t cp;
	u_int cnt;
	struct uio *uio;
	struct vm_object *obj;
	int disposable;
{
	struct iovec *iov;
	int error;

	iov = uio->uio_iov;

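	/*
	 * The zero-copy paths below are only usable when the request
	 * is a multiple of the page size and page-aligned at the
	 * source buffer, the destination buffer, and the file offset;
	 * anything else falls back to plain copyin()/copyout().
	 */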
#ifdef ZERO_COPY_SOCKETS

	if (uio->uio_rw == UIO_READ) {
		if ((so_zero_copy_receive != 0)
		 && (obj != NULL)
		 && ((cnt & PAGE_MASK) == 0)
		 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		 && ((uio->uio_offset & PAGE_MASK) == 0)
		 && ((((intptr_t) cp) & PAGE_MASK) == 0)
		 && (obj->type == OBJT_DEFAULT)
		 && (disposable != 0)) {
			/* SOCKET: use page-trading */
			/*
			 * We only want to call vm_pgmoveco() on
			 * disposable pages, since it gives the
			 * kernel page to the userland process.
			 */
			error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
					    obj, (vm_offset_t)cp,
					    (vm_offset_t)iov->iov_base);

			/*
			 * If we get an error back, attempt
			 * to use copyout() instead.  The
			 * disposable page should be freed
			 * automatically if we weren't able to move
			 * it into userland.
			 */
			if (error != 0)
				error = copyout(cp, iov->iov_base, cnt);
#ifdef ENABLE_VFS_IOOPT
		} else if ((vfs_ioopt != 0)
		 && ((cnt & PAGE_MASK) == 0)
		 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		 && ((uio->uio_offset & PAGE_MASK) == 0)
		 && ((((intptr_t) cp) & PAGE_MASK) == 0)) {
			error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
					   uio->uio_offset, cnt,
					   (vm_offset_t) iov->iov_base, NULL);
#endif /* ENABLE_VFS_IOOPT */
		} else {
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
#else /* ZERO_COPY_SOCKETS */
	if (uio->uio_rw == UIO_READ) {
#ifdef ENABLE_VFS_IOOPT
		if ((vfs_ioopt != 0)
		 && ((cnt & PAGE_MASK) == 0)
		 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
		 && ((uio->uio_offset & PAGE_MASK) == 0)
		 && ((((intptr_t) cp) & PAGE_MASK) == 0)) {
			error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
					   uio->uio_offset, cnt,
					   (vm_offset_t) iov->iov_base, NULL);
		} else
#endif /* ENABLE_VFS_IOOPT */
		{
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
#endif /* ZERO_COPY_SOCKETS */

	return (error);
}

int
uiomoveco(cp, n, uio, obj, disposable)
	caddr_t cp;
	int n;
	struct uio *uio;
	struct vm_object *obj;
	int disposable;
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();

			error = userspaceco(cp, cnt, uio, obj, disposable);

			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp += cnt;
		n -= cnt;
	}
	return (0);
}
#endif /* ENABLE_VFS_IOOPT || ZERO_COPY_SOCKETS */

#ifdef ENABLE_VFS_IOOPT

/*
 * Experimental support for zero-copy I/O
 */
int
uioread(n, uio, obj, nread)
	int n;
	struct uio *uio;
	struct vm_object *obj;
	int *nread;
{
	int npagesmoved;
	struct iovec *iov;
	u_int cnt, tcnt;
	int error;

	*nread = 0;
	if (vfs_ioopt < 2)
		return (0);

	error = 0;

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		if ((uio->uio_segflg == UIO_USERSPACE) &&
		    ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
		    ((uio->uio_offset & PAGE_MASK) == 0)) {

			if (cnt < PAGE_SIZE)
				break;

			cnt &= ~PAGE_MASK;

			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
					   uio->uio_offset, cnt,
					   (vm_offset_t) iov->iov_base,
					   &npagesmoved);

			if (npagesmoved == 0)
				break;

			tcnt = npagesmoved * PAGE_SIZE;
			cnt = tcnt;

			if (error)
				break;

			iov->iov_base = (char *)iov->iov_base + cnt;
			iov->iov_len -= cnt;
			uio->uio_resid -= cnt;
			uio->uio_offset += cnt;
			*nread += cnt;
			n -= cnt;
		} else {
			break;
		}
	}
	return (error);
}
#endif /* ENABLE_VFS_IOOPT */

/*
 * Give next character to user as result of read.
 */
int
ureadc(c, uio)
	register int c;
	register struct uio *uio;
{
	register struct iovec *iov;
	register char *iov_base;

again:
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		iov_base = iov->iov_base;
		*iov_base = c;
		iov->iov_base = iov_base;
		break;

	case UIO_NOCOPY:
		break;
	}
	iov->iov_base = (char *)iov->iov_base + 1;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}
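
/*
 * Example (hypothetical caller): a device read routine that produces
 * one character at a time might loop as follows:
 *
 *	while (uio->uio_resid > 0 && (c = foo_getc(sc)) != -1)
 *		if ((error = ureadc(c, uio)) != 0)
 *			break;
 */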

/*
 * General routine to allocate a hash table.
 */
void *
hashinit(elements, type, hashmask)
	int elements;
	struct malloc_type *type;
	u_long *hashmask;
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("hashinit: bad elements");
	for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
		continue;
	hashsize >>= 1;
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, 0);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*hashmask = hashsize - 1;
	return (hashtbl);
}
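
/*
 * Example (hypothetical caller): the table size is the largest power
 * of two not exceeding `elements', so lookups index the table by
 * masking a hash value with the returned mask:
 *
 *	u_long foo_hashmask;
 *	LIST_HEAD(foo_head, foo) *foo_hashtbl;
 *
 *	foo_hashtbl = hashinit(64, M_TEMP, &foo_hashmask);
 *	head = &foo_hashtbl[hashval & foo_hashmask];
 */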

/*
 * Free a hash table allocated with hashinit(); panics if any chain
 * is still populated.
 */
void
hashdestroy(vhashtbl, type, hashmask)
	void *vhashtbl;
	struct malloc_type *type;
	u_long hashmask;
{
	LIST_HEAD(generic, generic) *hashtbl, *hp;

	hashtbl = vhashtbl;
	for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++)
		if (!LIST_EMPTY(hp))
			panic("hashdestroy: hash not empty");
	free(hashtbl, type);
}

static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
			2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
			7159, 7673, 8191, 12281, 16381, 24571, 32749 };
#define NPRIMES (sizeof(primes) / sizeof(primes[0]))

/*
 * General routine to allocate a prime number sized hash table.
 */
void *
phashinit(elements, type, nentries)
	int elements;
	struct malloc_type *type;
	u_long *nentries;
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("phashinit: bad elements");
	for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
		i++;
		if (i == NPRIMES)
			break;
		hashsize = primes[i];
	}
	hashsize = primes[i - 1];
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, 0);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*nentries = hashsize;
	return (hashtbl);
}
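
/*
 * Example (hypothetical caller): since the table size is prime rather
 * than a power of two, callers index it with a modulus instead of a
 * mask:
 *
 *	u_long foo_nentries;
 *
 *	foo_hashtbl = phashinit(100, M_TEMP, &foo_nentries);
 *	head = &foo_hashtbl[hashval % foo_nentries];
 */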

/*
 * Voluntarily surrender the processor; called from uiomove() and
 * friends when a long copy has been hogging the CPU.  The thread's
 * priority is reset to its user priority before switching.
 */
void
uio_yield()
{
	struct thread *td;

	td = curthread;
	mtx_lock_spin(&sched_lock);
	DROP_GIANT();
	sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */
	td->td_proc->p_stats->p_ru.ru_nivcsw++;
	mi_switch();
	mtx_unlock_spin(&sched_lock);
	PICKUP_GIANT();
}

/*
 * Copy `len' bytes into the kernel from user or kernel space, as
 * selected by the uio_seg value `seg'.
 */
int
copyinfrom(const void *src, void *dst, size_t len, int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyin(src, dst, len);
		break;
	case UIO_SYSSPACE:
		bcopy(src, dst, len);
		break;
	default:
		panic("copyinfrom: bad seg %d\n", seg);
	}
	return (error);
}

/*
 * Copy a NUL-terminated string into the kernel from user or kernel
 * space, as selected by `seg'.  The number of bytes actually copied,
 * including the terminating NUL, is returned in *copied.
 */
int
copyinstrfrom(const void *src, void *dst, size_t len, size_t *copied, int seg)
{
	int error = 0;

	switch (seg) {
	case UIO_USERSPACE:
		error = copyinstr(src, dst, len, copied);
		break;
	case UIO_SYSSPACE:
		error = copystr(src, dst, len, copied);
		break;
	default:
		panic("copyinstrfrom: bad seg %d\n", seg);
	}
	return (error);
}