1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <mtmalloc.h>
28#include "mtmalloc_impl.h"
29#include <unistd.h>
30#include <synch.h>
31#include <thread.h>
32#include <pthread.h>
33#include <stdio.h>
34#include <limits.h>
35#include <errno.h>
36#include <string.h>
37#include <strings.h>
38#include <sys/param.h>
39#include <sys/sysmacros.h>
40
41/*
42 * To turn on the asserts just compile -DDEBUG
43 */
44
45#ifndef	DEBUG
46#define	NDEBUG
47#endif
48
49#include <assert.h>
50
51/*
52 * The MT hot malloc implementation contained herein is designed to be
53 * plug-compatible with the libc version of malloc. It is not intended
54 * to replace that implementation until we decide that it is ok to break
55 * customer apps (Solaris 3.0).
56 *
57 * For requests up to 2^16, the allocator initializes itself into NCPUS
58 * worth of chains of caches. When a memory request is made, the calling thread
59 * is vectored into one of NCPUS worth of caches.  The LWP id gives us a cheap,
60 * contention-reducing index to use; eventually, this should be replaced with
61 * the actual CPU sequence number, once an interface to get it is available.
62 *
63 * Once the thread is vectored into one of the lists of caches, the real
64 * allocation of the memory begins. The request size determines which
65 * bucket the allocation should be satisfied from. The management of free
66 * buckets is done via a bitmask. A free bucket is represented by a 1. The
67 * first free bit represents the first free bucket, and the position of the
68 * bit represents the position of the bucket in the arena.
69 *
70 * When the memory from the arena is handed out, the address of the cache
71 * control structure is written in the word preceding the returned memory.
72 * This cache control address is used during free() to mark the buffer free
73 * in the cache control structure.
74 *
75 * When all available memory in a cache has been depleted, a new chunk of memory
76 * is allocated via sbrk(). The new cache is allocated from this chunk of memory
77 * and initialized in the function create_cache(). New caches are installed at
78 * the front of a singly linked list of the same size memory pools. This helps
79 * to ensure that there will tend to be available memory in the beginning of the
80 * list.
81 *
82 * Long linked lists hurt performance. To decrease this effect, there is a
83 * tunable, requestsize, that bumps up the sbrk allocation size and thus
84 * increases the number of available blocks within an arena.  We also keep
85 * a "hint" for each cache list, which is the last cache in the list allocated
86 * from.  This lowers the cost of searching if there are a lot of fully
87 * allocated blocks at the front of the list.
88 *
89 * For requests greater than 2^16 (oversize allocations), there are two pieces
90 * of overhead. There is the OVERHEAD used to hold the cache addr
91 * (&oversize_list), plus an oversize_t structure to further describe the block.
92 *
93 * The oversize list is kept as defragmented as possible by coalescing
94 * freed oversized allocations with adjacent neighbors.
95 *
96 * Addresses handed out are stored in a hash table, and are aligned on
97 * MTMALLOC_MIN_ALIGN-byte boundaries at both ends. Request sizes are rounded up
98 * where necessary in order to achieve this. This eases the implementation of
99 * MTDEBUGPATTERN and MTINITPATTERN, particularly where coalescing occurs.
100 *
101 * A memalign allocation takes memalign header overhead.  There are two
102 * types of memalign headers, distinguished by MTMALLOC_MEMALIGN_MAGIC
103 * and MTMALLOC_MEMALIGN_MIN_MAGIC.  When the distance from the malloc'ed
104 * address to the aligned address is exactly the minimum size
105 * OVERHEAD, we create a header taking only one OVERHEAD space with magic
106 * number MTMALLOC_MEMALIGN_MIN_MAGIC; subtracting OVERHEAD from the
107 * memaligned address then yields the malloc'ed address. Otherwise,
108 * we create a memalign header taking two OVERHEAD spaces: one stores the
109 * MTMALLOC_MEMALIGN_MAGIC magic number, and the other points back to the
110 * malloc'ed address.
111 */
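
/*
 * Illustrative sketch (not part of the allocator itself): for a pointer p
 * returned by this malloc() for a cached (<= 64K) request, free() recovers
 * the owning cache from the word that precedes the buffer, roughly:
 *
 *	cache_t *cp = (cache_t *)*(uintptr_t *)((caddr_t)p - OVERHEAD);
 *
 * For oversize and memalign buffers that same word instead holds one of the
 * MTMALLOC_*_MAGIC values defined below, which free() and realloc() check
 * before treating it as a cache pointer.
 */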
112
113#if defined(__i386) || defined(__amd64)
114#include <arpa/inet.h>	/* for htonl() */
115#endif
116
117static void * morecore(size_t);
118static void create_cache(cache_t *, size_t bufsize, uint_t hunks);
119static void * malloc_internal(size_t, percpu_t *);
120static void * oversize(size_t);
121static oversize_t *find_oversize(size_t);
122static void add_oversize(oversize_t *);
123static void copy_pattern(uint32_t, void *, size_t);
124static void * verify_pattern(uint32_t, void *, size_t);
125static void reinit_cpu_list(void);
126static void reinit_cache(cache_t *);
127static void free_oversize(oversize_t *);
128static oversize_t *oversize_header_alloc(uintptr_t, size_t);
129
130/*
131 * oversize hash table stuff
132 */
133#define	NUM_BUCKETS	67	/* must be prime */
134#define	HASH_OVERSIZE(caddr)	((uintptr_t)(caddr) % NUM_BUCKETS)
135oversize_t *ovsz_hashtab[NUM_BUCKETS];
136
137#define	ALIGN(x, a)	((((uintptr_t)(x) + ((uintptr_t)(a) - 1)) \
138			& ~((uintptr_t)(a) - 1)))
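
/* e.g. ALIGN(0x1005, 8) == 0x1008; ALIGN(0x1008, 8) == 0x1008 (unchanged). */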
139
140/* need this to deal with the little-endianness of x86 */
141#if defined(__i386) || defined(__amd64)
142#define	FLIP_EM(x)	htonl((x))
143#else
144#define	FLIP_EM(x)	(x)
145#endif
146
147#define	INSERT_ONLY			0
148#define	COALESCE_LEFT			0x00000001
149#define	COALESCE_RIGHT			0x00000002
150#define	COALESCE_WITH_BOTH_SIDES	(COALESCE_LEFT | COALESCE_RIGHT)
151
152#define	OVERHEAD	8	/* size needed to write cache addr */
153#define	HUNKSIZE	8192	/* just a multiplier */
154
155#define	MAX_CACHED_SHIFT	16	/* 64K is the max cached size */
156#define	MAX_CACHED		(1 << MAX_CACHED_SHIFT)
157#define	MIN_CACHED_SHIFT	4	/* smaller requests rounded up */
158#define	MTMALLOC_MIN_ALIGN	8	/* min guaranteed alignment */
159
160/* maximum size before overflow */
161#define	MAX_MTMALLOC	(SIZE_MAX - (SIZE_MAX % MTMALLOC_MIN_ALIGN) \
162			- OVSZ_HEADER_SIZE)
163
164#define	NUM_CACHES	(MAX_CACHED_SHIFT - MIN_CACHED_SHIFT + 1)
165#define	CACHELIST_SIZE	ALIGN(NUM_CACHES * sizeof (cache_head_t), \
166    CACHE_COHERENCY_UNIT)
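
/*
 * With the values above (MIN_CACHED_SHIFT 4, MAX_CACHED_SHIFT 16), NUM_CACHES
 * is 13: one bucket for each power-of-two request size from 16 through
 * 65536 bytes.
 */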
167
168#define	MINSIZE		9	/* for requestsize, tunable */
169#define	MAXSIZE		256	/* arbitrary, big enough, for requestsize */
170
171#define	FREEPATTERN	0xdeadbeef /* debug fill pattern for free buf */
172#define	INITPATTERN	0xbaddcafe /* debug fill pattern for new buf */
173
174#define	misaligned(p)	((unsigned)(p) & (sizeof (int) - 1))
175#define	IS_OVERSIZE(x, y)	(((x) < (y)) && (((x) > MAX_CACHED)? 1 : 0))
176
177static long requestsize = MINSIZE; /* 9 pages per cache; tunable; 9 is min */
178
179static uint_t cpu_mask;
180static curcpu_func curcpu;
181
182static int32_t debugopt;
183static int32_t reinit;
184
185static percpu_t *cpu_list;
186static oversize_t oversize_list;
187static mutex_t oversize_lock = DEFAULTMUTEX;
188
189static int ncpus = 0;
190
191#define	MTMALLOC_OVERSIZE_MAGIC		((uintptr_t)&oversize_list)
192#define	MTMALLOC_MEMALIGN_MAGIC		((uintptr_t)&oversize_list + 1)
193#define	MTMALLOC_MEMALIGN_MIN_MAGIC	((uintptr_t)&oversize_list + 2)
194
195/*
196 * We require allocations handed out to be aligned on MTMALLOC_MIN_ALIGN-byte
197 * boundaries. We round up sizeof (oversize_t) (when necessary) to ensure that
198 * this is achieved.
199 */
200#define	OVSZ_SIZE		(ALIGN(sizeof (oversize_t), MTMALLOC_MIN_ALIGN))
201#define	OVSZ_HEADER_SIZE	(OVSZ_SIZE + OVERHEAD)
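
/*
 * An oversize block, as built by oversize_header_alloc() below, is laid out
 * as (sketch): an oversize_t header (OVSZ_SIZE bytes), then an OVERHEAD-sized
 * slot holding MTMALLOC_OVERSIZE_MAGIC, then the user data; oversize_t.addr
 * points at the user data, and that is the address handed to the caller.
 */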
202
203/*
204 * memalign header takes 2 OVERHEAD space.  One for memalign magic, and the
205 * other one points back to the start address of originally allocated space.
206 */
207#define	MEMALIGN_HEADER_SIZE	(2 * OVERHEAD)
208#define	MEMALIGN_HEADER_ALLOC(x, shift, malloc_addr)\
209	if (shift == OVERHEAD)\
210		*((uintptr_t *)((caddr_t)x - OVERHEAD)) = \
211			MTMALLOC_MEMALIGN_MIN_MAGIC; \
212	else {\
213		*((uintptr_t *)((caddr_t)x - OVERHEAD)) = \
214			MTMALLOC_MEMALIGN_MAGIC; \
215		*((uintptr_t *)((caddr_t)x - 2 * OVERHEAD)) = \
216			(uintptr_t)malloc_addr; \
217	}
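
/*
 * Resulting header layouts (illustration only; x is the aligned address
 * handed back by memalign()):
 *
 *	shift == OVERHEAD:	x - OVERHEAD holds MTMALLOC_MEMALIGN_MIN_MAGIC
 *				(and is itself the malloc'ed address).
 *
 *	shift > OVERHEAD:	x - OVERHEAD holds MTMALLOC_MEMALIGN_MAGIC and
 *				x - 2 * OVERHEAD holds the malloc'ed address.
 */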
218
219/*
220 * Add big to the oversize hash table at the head of the relevant bucket.
221 */
222static void
223insert_hash(oversize_t *big)
224{
225	caddr_t ret = big->addr;
226	int bucket = HASH_OVERSIZE(ret);
227
228	assert(MUTEX_HELD(&oversize_lock));
229	big->hash_next = ovsz_hashtab[bucket];
230	ovsz_hashtab[bucket] = big;
231}
232
233void *
234malloc(size_t bytes)
235{
236	percpu_t *list_rotor;
237	uint_t	list_index;
238
239	if (bytes > MAX_CACHED)
240		return (oversize(bytes));
241
242	list_index = (curcpu() & cpu_mask);
243
244	list_rotor = &cpu_list[list_index];
245
246	return (malloc_internal(bytes, list_rotor));
247}
248
249void *
250realloc(void * ptr, size_t bytes)
251{
252	void *new, *data_ptr;
253	cache_t *cacheptr;
254	caddr_t mem;
255	size_t shift = 0;
256
257	if (ptr == NULL)
258		return (malloc(bytes));
259
260	if (bytes == 0) {
261		free(ptr);
262		return (NULL);
263	}
264
265	data_ptr = ptr;
266	mem = (caddr_t)ptr - OVERHEAD;
267
268	/*
269	 * Optimization possibility:
270	 *	p = malloc(64);
271	 *	q = realloc(p, 64);
272	 * q can be same as p.
273	 * Apply this optimization for the normal
274	 * sized caches for now.
275	 */
276	if (*(uintptr_t *)mem < MTMALLOC_OVERSIZE_MAGIC ||
277	    *(uintptr_t *)mem > MTMALLOC_MEMALIGN_MIN_MAGIC) {
278		cacheptr = (cache_t *)*(uintptr_t *)mem;
279		if (bytes <= (cacheptr->mt_size - OVERHEAD))
280			return (ptr);
281	}
282
283	new = malloc(bytes);
284
285	if (new == NULL)
286		return (NULL);
287
288	/*
289	 * If new == ptr, ptr has previously been freed. Passing a freed pointer
290	 * to realloc() is not allowed - unless the caller specifically states
291	 * otherwise, in which case we must avoid freeing ptr (ie new) before we
292	 * return new. There is (obviously) no requirement to memcpy() ptr to
293	 * new before we return.
294	 */
295	if (new == ptr) {
296		if (!(debugopt & MTDOUBLEFREE))
297			abort();
298		return (new);
299	}
300
301	if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MAGIC) {
302		mem -= OVERHEAD;
303		ptr = (void *)*(uintptr_t *)mem;
304		mem = (caddr_t)ptr - OVERHEAD;
305		shift = (size_t)((uintptr_t)data_ptr - (uintptr_t)ptr);
306	} else if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MIN_MAGIC) {
307		ptr = (void *) mem;
308		mem -= OVERHEAD;
309		shift = OVERHEAD;
310	}
311
312	if (*(uintptr_t *)mem == MTMALLOC_OVERSIZE_MAGIC) {
313		oversize_t *old;
314
315		old = (oversize_t *)(mem - OVSZ_SIZE);
316		(void) memcpy(new, data_ptr, MIN(bytes, old->size - shift));
317		free(ptr);
318		return (new);
319	}
320
321	cacheptr = (cache_t *)*(uintptr_t *)mem;
322
323	(void) memcpy(new, data_ptr,
324	    MIN(cacheptr->mt_size - OVERHEAD - shift, bytes));
325	free(ptr);
326
327	return (new);
328}
329
330void *
331calloc(size_t nelem, size_t bytes)
332{
333	void * ptr;
334	size_t size = nelem * bytes;
335
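	/*
	 * Guard against multiplication overflow in nelem * bytes; without
	 * this check a huge request could wrap to a small allocation.
	 */
	if (nelem != 0 && size / nelem != bytes) {
		errno = ENOMEM;
		return (NULL);
	}
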
336	ptr = malloc(size);
337	if (ptr == NULL)
338		return (NULL);
339	(void) memset(ptr, 0, size);
340
341	return (ptr);
342}
343
344void
345free(void * ptr)
346{
347	cache_t *cacheptr;
348	caddr_t mem;
349	int32_t i;
350	caddr_t freeblocks;
351	uintptr_t offset;
352	uchar_t mask;
353	int32_t which_bit, num_bytes;
354
355	if (ptr == NULL)
356		return;
357
358	mem = (caddr_t)ptr - OVERHEAD;
359
360	if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MAGIC) {
361		mem -= OVERHEAD;
362		ptr = (void *)*(uintptr_t *)mem;
363		mem = (caddr_t)ptr - OVERHEAD;
364	} else if (*(uintptr_t *)mem == MTMALLOC_MEMALIGN_MIN_MAGIC) {
365		ptr = (void *) mem;
366		mem -= OVERHEAD;
367	}
368
369	if (*(uintptr_t *)mem == MTMALLOC_OVERSIZE_MAGIC) {
370		oversize_t *big, **opp;
371		int bucket;
372
373		big = (oversize_t *)(mem - OVSZ_SIZE);
374		(void) mutex_lock(&oversize_lock);
375
376		bucket = HASH_OVERSIZE(big->addr);
377		for (opp = &ovsz_hashtab[bucket]; *opp != NULL;
378		    opp = &(*opp)->hash_next)
379			if (*opp == big)
380				break;
381
382		if (*opp == NULL) {
383			if (!(debugopt & MTDOUBLEFREE))
384				abort();
385			(void) mutex_unlock(&oversize_lock);
386			return;
387		}
388
389		*opp = big->hash_next;	/* remove big from the hash table */
390		big->hash_next = NULL;
391
392		if (debugopt & MTDEBUGPATTERN)
393			copy_pattern(FREEPATTERN, ptr, big->size);
394		add_oversize(big);
395		(void) mutex_unlock(&oversize_lock);
396		return;
397	}
398
399	cacheptr = (cache_t *)*(uintptr_t *)mem;
400	freeblocks = cacheptr->mt_freelist;
401
402	/*
403	 * Compute the distance into the arena.  The value of offset is in
404	 * bytes, but there is a 1-1 correspondence between distance into
405	 * the arena (in block-sized units) and bit position in the
406	 * freelist bitmask.
407	 */
408	offset = mem - cacheptr->mt_arena;
409
410	/*
411	 * i is total number of bits to offset into freelist bitmask.
412	 */
413
414	i = offset / cacheptr->mt_size;
415
416	num_bytes = i >> 3;
417
418	/*
419	 * which_bit is the bit offset into the byte in the freelist.
420	 * If our freelist bitmask looks like 0xf3 and we are freeing
421	 * block 5 (i.e. the 6th block), our mask will be 0xf7 after
422	 * the free. Bits run left to right; that's why the mask is 0x80
423	 * and not 0x01.
424	 */
425	which_bit = i - (num_bytes << 3);
426
427	mask = 0x80 >> which_bit;
428
429	freeblocks += num_bytes;
430
431	if (debugopt & MTDEBUGPATTERN)
432		copy_pattern(FREEPATTERN, ptr, cacheptr->mt_size - OVERHEAD);
433
434	(void) mutex_lock(&cacheptr->mt_cache_lock);
435
436	if (*freeblocks & mask) {
437		if (!(debugopt & MTDOUBLEFREE))
438			abort();
439	} else {
440		*freeblocks |= mask;
441		cacheptr->mt_nfree++;
442	}
443
444	(void) mutex_unlock(&cacheptr->mt_cache_lock);
445}
446
447void *
448memalign(size_t alignment, size_t size)
449{
450	size_t alloc_size;
451	uintptr_t offset;
452	void *alloc_buf;
453	void *ret_buf;
454
455	if (size == 0 || alignment == 0 || misaligned(alignment) ||
456	    (alignment & (alignment - 1)) != 0) {
457		errno = EINVAL;
458		return (NULL);
459	}
460
461	/* <= MTMALLOC_MIN_ALIGN, malloc can provide directly */
462	if (alignment <= MTMALLOC_MIN_ALIGN)
463		return (malloc(size));
464
465	alloc_size = size + alignment - MTMALLOC_MIN_ALIGN;
466
467	if (alloc_size < size) { /* overflow */
468		errno = ENOMEM;
469		return (NULL);
470	}
471
472	alloc_buf = malloc(alloc_size);
473
474	if (alloc_buf == NULL)
475		/* malloc sets errno */
476		return (NULL);
477
478	/*
479	 * If alloc_size > MAX_CACHED, malloc() will have returned a multiple of
480	 * MTMALLOC_MIN_ALIGN, having rounded-up alloc_size if necessary. Since
481	 * we will use alloc_size to return the excess fragments to the free
482	 * list, we also round-up alloc_size if necessary.
483	 */
484	if ((alloc_size > MAX_CACHED) &&
485	    (alloc_size & (MTMALLOC_MIN_ALIGN - 1)))
486		alloc_size = ALIGN(alloc_size, MTMALLOC_MIN_ALIGN);
487
488	if ((offset = (uintptr_t)alloc_buf & (alignment - 1)) == 0) {
489		/* aligned correctly */
490
491		size_t frag_size = alloc_size -
492		    (size + MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
493
494		/*
495		 * If the leftover piece of the memory > MAX_CACHED,
496		 * split off the piece and return it back to the freelist.
497		 */
498		if (IS_OVERSIZE(frag_size, alloc_size)) {
499			oversize_t *orig, *tail;
500			uintptr_t taddr;
501			size_t data_size;
502			taddr = ALIGN((uintptr_t)alloc_buf + size,
503			    MTMALLOC_MIN_ALIGN);
504			data_size = taddr - (uintptr_t)alloc_buf;
505			orig = (oversize_t *)((uintptr_t)alloc_buf -
506			    OVSZ_HEADER_SIZE);
507			frag_size = orig->size - data_size -
508			    OVSZ_HEADER_SIZE;
509			orig->size = data_size;
510			tail = oversize_header_alloc(taddr, frag_size);
511			free_oversize(tail);
512		}
513		ret_buf = alloc_buf;
514	} else {
515		uchar_t	oversize_bits = 0;
516		size_t	head_sz, data_sz, tail_sz;
517		uintptr_t ret_addr, taddr, shift, tshift;
518		oversize_t *orig, *tail, *big;
519		size_t tsize;
520
521		/* needs to be aligned */
522		shift = alignment - offset;
523
524		assert(shift >= MTMALLOC_MIN_ALIGN);
525
526		ret_addr = ((uintptr_t)alloc_buf + shift);
527		ret_buf = (void *)ret_addr;
528
529		if (alloc_size <= MAX_CACHED) {
530			MEMALIGN_HEADER_ALLOC(ret_addr, shift, alloc_buf);
531			return (ret_buf);
532		}
533
534		/*
535		 * Only check for the fragments when the memory is allocated
536		 * from oversize_list.  Split off a fragment and return it
537		 * to the oversize freelist when it's > MAX_CACHED.
538		 */
539
540		head_sz = shift - MAX(MEMALIGN_HEADER_SIZE, OVSZ_HEADER_SIZE);
541
542		tail_sz = alloc_size -
543		    (shift + size + MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
544
545		oversize_bits |= IS_OVERSIZE(head_sz, alloc_size) |
546		    IS_OVERSIZE(size, alloc_size) << DATA_SHIFT |
547		    IS_OVERSIZE(tail_sz, alloc_size) << TAIL_SHIFT;
548
549		switch (oversize_bits) {
550			case NONE_OVERSIZE:
551			case DATA_OVERSIZE:
552				MEMALIGN_HEADER_ALLOC(ret_addr, shift,
553				    alloc_buf);
554				break;
555			case HEAD_OVERSIZE:
556				/*
557				 * If we can extend data to > MAX_CACHED while
558				 * the head stays > MAX_CACHED, split the head
559				 * end as in the head-and-data-oversized case;
560				 * otherwise just create the memalign header.
561				 */
562				tsize = (shift + size) - (MAX_CACHED + 8 +
563				    MTMALLOC_MIN_ALIGN + OVSZ_HEADER_SIZE);
564
565				if (!IS_OVERSIZE(tsize, alloc_size)) {
566					MEMALIGN_HEADER_ALLOC(ret_addr, shift,
567					    alloc_buf);
568					break;
569				} else {
570					tsize += OVSZ_HEADER_SIZE;
571					taddr = ALIGN((uintptr_t)alloc_buf +
572					    tsize, MTMALLOC_MIN_ALIGN);
573					tshift = ret_addr - taddr;
574					MEMALIGN_HEADER_ALLOC(ret_addr, tshift,
575					    taddr);
576					ret_addr = taddr;
577					shift = ret_addr - (uintptr_t)alloc_buf;
578				}
579				/* FALLTHROUGH */
580			case HEAD_AND_DATA_OVERSIZE:
581				/*
582				 * Split off the head fragment and
583				 * return it back to oversize freelist.
584				 * Create oversize header for the piece
585				 * of (data + tail fragment).
586				 */
587				orig = (oversize_t *)((uintptr_t)alloc_buf -
588				    OVSZ_HEADER_SIZE);
589				big = oversize_header_alloc(ret_addr -
590				    OVSZ_HEADER_SIZE, (orig->size - shift));
591				(void) mutex_lock(&oversize_lock);
592				insert_hash(big);
593				(void) mutex_unlock(&oversize_lock);
594				orig->size = shift - OVSZ_HEADER_SIZE;
595
596				/* free up the head fragment */
597				free_oversize(orig);
598				break;
599			case TAIL_OVERSIZE:
600				/*
601				 * If we can extend data to > MAX_CACHED while
602				 * the tail end stays > MAX_CACHED, split the tail
603				 * end; otherwise just create the memalign header.
604				 */
605				orig = (oversize_t *)((uintptr_t)alloc_buf -
606				    OVSZ_HEADER_SIZE);
607				tsize =  orig->size - (MAX_CACHED + 8 +
608				    shift + OVSZ_HEADER_SIZE +
609				    MTMALLOC_MIN_ALIGN);
610				if (!IS_OVERSIZE(tsize, alloc_size)) {
611					MEMALIGN_HEADER_ALLOC(ret_addr, shift,
612					    alloc_buf);
613					break;
614				} else {
615					size = MAX_CACHED + 8;
616				}
617				/* FALLTHROUGH */
618			case DATA_AND_TAIL_OVERSIZE:
619				/*
620				 * Split off the tail fragment and
621				 * return it back to oversize freelist.
622				 * Create memalign header and adjust
623				 * the size for the piece of
624				 * (head fragment + data).
625				 */
626				taddr = ALIGN(ret_addr + size,
627				    MTMALLOC_MIN_ALIGN);
628				data_sz = (size_t)(taddr -
629				    (uintptr_t)alloc_buf);
630				orig = (oversize_t *)((uintptr_t)alloc_buf -
631				    OVSZ_HEADER_SIZE);
632				tsize = orig->size - data_sz;
633				orig->size = data_sz;
634				MEMALIGN_HEADER_ALLOC(ret_buf, shift,
635				    alloc_buf);
636				tsize -= OVSZ_HEADER_SIZE;
637				tail = oversize_header_alloc(taddr,  tsize);
638				free_oversize(tail);
639				break;
640			case HEAD_AND_TAIL_OVERSIZE:
641				/*
642				 * Split off the head fragment.
643				 * We try to free up the tail end when we can
644				 * extend the data size to (MAX_CACHED + 8)
645				 * and the tail end remains oversized.
646				 * The bottom line is that all split pieces
647				 * must themselves be oversize.
648				 */
649				orig = (oversize_t *)((uintptr_t)alloc_buf -
650				    OVSZ_HEADER_SIZE);
651				tsize =  orig->size - (MAX_CACHED + 8 +
652				    OVSZ_HEADER_SIZE + shift +
653				    MTMALLOC_MIN_ALIGN);
654
655				if (!IS_OVERSIZE(tsize, alloc_size)) {
656					/*
657					 * If the chunk is not big enough
658					 * to make both data and tail oversize
659					 * we just keep them as one piece.
660					 */
661					big = oversize_header_alloc(ret_addr -
662					    OVSZ_HEADER_SIZE,
663					    orig->size - shift);
664					(void) mutex_lock(&oversize_lock);
665					insert_hash(big);
666					(void) mutex_unlock(&oversize_lock);
667					orig->size = shift - OVSZ_HEADER_SIZE;
668					free_oversize(orig);
669					break;
670				} else {
671					/*
672					 * extend data size > MAX_CACHED
673					 * and handle it as head, data, tail
674					 * are all oversized.
675					 */
676					size = MAX_CACHED + 8;
677				}
678				/* FALLTHROUGH */
679			case ALL_OVERSIZE:
680				/*
681				 * split off the head and tail fragments,
682				 * return them back to the oversize freelist.
683				 * Alloc oversize header for data seg.
684				 */
685				orig = (oversize_t *)((uintptr_t)alloc_buf -
686				    OVSZ_HEADER_SIZE);
687				tsize = orig->size;
688				orig->size = shift - OVSZ_HEADER_SIZE;
689				free_oversize(orig);
690
691				taddr = ALIGN(ret_addr + size,
692				    MTMALLOC_MIN_ALIGN);
693				data_sz = taddr - ret_addr;
694				assert(tsize > (shift + data_sz +
695				    OVSZ_HEADER_SIZE));
696				tail_sz = tsize -
697				    (shift + data_sz + OVSZ_HEADER_SIZE);
698
699				/* create oversize header for data seg */
700				big = oversize_header_alloc(ret_addr -
701				    OVSZ_HEADER_SIZE, data_sz);
702				(void) mutex_lock(&oversize_lock);
703				insert_hash(big);
704				(void) mutex_unlock(&oversize_lock);
705
706				/* create oversize header for tail fragment */
707				tail = oversize_header_alloc(taddr, tail_sz);
708				free_oversize(tail);
709				break;
710			default:
711				/* should not reach here */
712				assert(0);
713		}
714	}
715	return (ret_buf);
716}
717
718
719void *
720valloc(size_t size)
721{
722	static unsigned pagesize;
723
724	if (size == 0)
725		return (NULL);
726
727	if (!pagesize)
728		pagesize = sysconf(_SC_PAGESIZE);
729
730	return (memalign(pagesize, size));
731}
732
733void
734mallocctl(int cmd, long value)
735{
736	switch (cmd) {
737
738	case MTDEBUGPATTERN:
739		/*
740		 * Reinitialize free blocks in case malloc() is called prior
741		 * to mallocctl().
742		 */
743		if (value && !(debugopt & cmd)) {
744			reinit++;
745			debugopt |= cmd;
746			reinit_cpu_list();
747		}
748		/*FALLTHRU*/
749	case MTDOUBLEFREE:
750	case MTINITBUFFER:
751		if (value)
752			debugopt |= cmd;
753		else
754			debugopt &= ~cmd;
755		break;
756	case MTCHUNKSIZE:
757		if (value >= MINSIZE && value <= MAXSIZE)
758			requestsize = value;
759		break;
760	default:
761		break;
762	}
763}
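
/*
 * Usage sketch (application code, not part of this library): debugging
 * options are normally set before the allocations of interest, e.g.
 *
 *	mallocctl(MTDEBUGPATTERN, 1);	fill freed buffers with FREEPATTERN
 *	mallocctl(MTINITBUFFER, 1);	fill new buffers with INITPATTERN
 *	mallocctl(MTCHUNKSIZE, 64);	each new cache chunk is 64 * HUNKSIZE
 *
 * See mallocctl(3MALLOC) for the supported commands.
 */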
764
765/*
766 * Initialization function, called from the init section of the library.
767 * No locking is required here because we are single-threaded during
768 * library initialization.
769 */
770static void
771setup_caches(void)
772{
773	uintptr_t oldbrk;
774	uintptr_t newbrk;
775
776	size_t cache_space_needed;
777	size_t padding;
778
779	curcpu_func new_curcpu;
780	uint_t new_cpu_mask;
781	percpu_t *new_cpu_list;
782
783	uint_t i, j;
784	uintptr_t list_addr;
785
786	/*
787	 * Get a decent "current cpu identifier", to be used to reduce
788	 * contention.  Eventually, this should be replaced by an interface
789	 * to get the actual CPU sequence number in libthread/liblwp.
790	 */
791	new_curcpu = (curcpu_func)thr_self;
792	if ((ncpus = 2 * sysconf(_SC_NPROCESSORS_CONF)) <= 0)
793		ncpus = 4; /* decent default value */
794
795	/* round ncpus up to a power of 2 */
796	while (ncpus & (ncpus - 1))
797		ncpus++;
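	/* e.g. a 3-CPU system: ncpus = 2 * 3 = 6, rounded up here to 8 */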
798
799	new_cpu_mask = ncpus - 1;	/* create the cpu mask */
800
801	/*
802	 * We now do some magic with the brk.  What we want to get in the
803	 * end is a bunch of well-aligned stuff in a big initial allocation.
804	 * Along the way, we do sanity checks to make sure no one else has
805	 * touched the brk (which shouldn't happen, but it's always good to
806	 * check).
807	 *
808	 * First, make sure sbrk is sane, and store the current brk in oldbrk.
809	 */
810	oldbrk = (uintptr_t)sbrk(0);
811	if ((void *)oldbrk == (void *)-1)
812		abort();	/* sbrk is broken -- we're doomed. */
813
814	/*
815	 * Now, align the brk to a multiple of CACHE_COHERENCY_UNIT, so that
816	 * the percpu structures and cache lists will be properly aligned.
817	 *
818	 * In addition, all hunks will be page-aligned, assuming
819	 * HUNKSIZE >= PAGESIZE, so they can be paged out individually.
820	 */
821	newbrk = ALIGN(oldbrk, CACHE_COHERENCY_UNIT);
822	if (newbrk != oldbrk && (uintptr_t)sbrk(newbrk - oldbrk) != oldbrk)
823		abort();	/* sbrk is broken -- we're doomed. */
824
825	/*
826	 * For each cpu, there is one percpu_t and a list of caches
827	 */
828	cache_space_needed = ncpus * (sizeof (percpu_t) + CACHELIST_SIZE);
829
830	new_cpu_list = (percpu_t *)sbrk(cache_space_needed);
831
832	if (new_cpu_list == (percpu_t *)-1 ||
833	    (uintptr_t)new_cpu_list != newbrk)
834		abort();	/* sbrk is broken -- we're doomed. */
835
836	/*
837	 * Finally, align the brk to HUNKSIZE so that all hunks are
838	 * page-aligned, to avoid edge-effects.
839	 */
840
841	newbrk = (uintptr_t)new_cpu_list + cache_space_needed;
842
843	padding = ALIGN(newbrk, HUNKSIZE) - newbrk;
844
845	if (padding > 0 && (uintptr_t)sbrk(padding) != newbrk)
846		abort();	/* sbrk is broken -- we're doomed. */
847
848	list_addr = ((uintptr_t)new_cpu_list + (sizeof (percpu_t) * ncpus));
849
850	/* initialize the percpu list */
851	for (i = 0; i < ncpus; i++) {
852		new_cpu_list[i].mt_caches = (cache_head_t *)list_addr;
853		for (j = 0; j < NUM_CACHES; j++) {
854			new_cpu_list[i].mt_caches[j].mt_cache = NULL;
855			new_cpu_list[i].mt_caches[j].mt_hint = NULL;
856		}
857
858		(void) mutex_init(&new_cpu_list[i].mt_parent_lock,
859		    USYNC_THREAD, NULL);
860
861		/* get the correct cache list alignment */
862		list_addr += CACHELIST_SIZE;
863	}
864
865	/*
866	 * Initialize oversize listhead
867	 */
868	oversize_list.next_bysize = &oversize_list;
869	oversize_list.prev_bysize = &oversize_list;
870	oversize_list.next_byaddr = &oversize_list;
871	oversize_list.prev_byaddr = &oversize_list;
872	oversize_list.addr = NULL;
873	oversize_list.size = 0;		/* sentinel */
874
875	/*
876	 * Now install the global variables.
877	 */
878	curcpu = new_curcpu;
879	cpu_mask = new_cpu_mask;
880	cpu_list = new_cpu_list;
881}
882
883static void
884create_cache(cache_t *cp, size_t size, uint_t chunksize)
885{
886	long nblocks;
887
888	(void) mutex_init(&cp->mt_cache_lock, USYNC_THREAD, NULL);
889	cp->mt_size = size;
890	cp->mt_freelist = ((caddr_t)cp + sizeof (cache_t));
891	cp->mt_span = chunksize * HUNKSIZE - sizeof (cache_t);
892	cp->mt_hunks = chunksize;
893	/*
894	 * rough calculation. We will need to adjust later.
895	 */
896	nblocks = cp->mt_span / cp->mt_size;
897	nblocks >>= 3;
898	if (nblocks == 0) { /* less than 8 free blocks in this pool */
899		int32_t numblocks = 0;
900		long i = cp->mt_span;
901		size_t sub = cp->mt_size;
902		uchar_t mask = 0;
903
904		while (i > sub) {
905			numblocks++;
906			i -= sub;
907		}
908		nblocks = numblocks;
909		cp->mt_arena = (caddr_t)ALIGN(cp->mt_freelist + 8, 8);
910		cp->mt_nfree = numblocks;
911		while (numblocks--) {
912			mask |= 0x80 >> numblocks;
913		}
914		*(cp->mt_freelist) = mask;
915	} else {
916		cp->mt_arena = (caddr_t)ALIGN((caddr_t)cp->mt_freelist +
917		    nblocks, 32);
918		/* recompute nblocks */
919		nblocks = (uintptr_t)((caddr_t)cp->mt_freelist +
920		    cp->mt_span - cp->mt_arena) / cp->mt_size;
921		cp->mt_nfree = ((nblocks >> 3) << 3);
922		/* Set everything to free */
923		(void) memset(cp->mt_freelist, 0xff, nblocks >> 3);
924	}
925
926	if (debugopt & MTDEBUGPATTERN)
927		copy_pattern(FREEPATTERN, cp->mt_arena, cp->mt_size * nblocks);
928
929	cp->mt_next = NULL;
930}
931
932static void
933reinit_cpu_list(void)
934{
935	oversize_t *wp = oversize_list.next_bysize;
936	percpu_t *cpuptr;
937	cache_t *thiscache;
938	cache_head_t *cachehead;
939
940	/* Reinitialize free oversize blocks. */
941	(void) mutex_lock(&oversize_lock);
942	if (debugopt & MTDEBUGPATTERN)
943		for (; wp != &oversize_list; wp = wp->next_bysize)
944			copy_pattern(FREEPATTERN, wp->addr, wp->size);
945	(void) mutex_unlock(&oversize_lock);
946
947	/* Reinitialize free blocks. */
948	for (cpuptr = &cpu_list[0]; cpuptr < &cpu_list[ncpus]; cpuptr++) {
949		(void) mutex_lock(&cpuptr->mt_parent_lock);
950		for (cachehead = &cpuptr->mt_caches[0]; cachehead <
951		    &cpuptr->mt_caches[NUM_CACHES]; cachehead++) {
952			for (thiscache = cachehead->mt_cache; thiscache != NULL;
953			    thiscache = thiscache->mt_next) {
954				(void) mutex_lock(&thiscache->mt_cache_lock);
955				if (thiscache->mt_nfree == 0) {
956					(void) mutex_unlock(
957					    &thiscache->mt_cache_lock);
958					continue;
959				}
960				if (thiscache != NULL)
961					reinit_cache(thiscache);
962				(void) mutex_unlock(&thiscache->mt_cache_lock);
963			}
964		}
965		(void) mutex_unlock(&cpuptr->mt_parent_lock);
966	}
967	reinit = 0;
968}
969
970static void
971reinit_cache(cache_t *thiscache)
972{
973	uint32_t *freeblocks; /* not a uintptr_t on purpose */
974	int32_t i, n;
975	caddr_t ret;
976
977	freeblocks = (uint32_t *)thiscache->mt_freelist;
978	while (freeblocks < (uint32_t *)thiscache->mt_arena) {
979		if (*freeblocks & 0xffffffff) {
980			for (i = 0; i < 32; i++) {
981				if (FLIP_EM(*freeblocks) & (0x80000000 >> i)) {
982					n = (uintptr_t)(((freeblocks -
983					    (uint32_t *)thiscache->mt_freelist)
984					    << 5) + i) * thiscache->mt_size;
985					ret = thiscache->mt_arena + n;
986					ret += OVERHEAD;
987					copy_pattern(FREEPATTERN, ret,
988					    thiscache->mt_size);
989				}
990			}
991		}
992		freeblocks++;
993	}
994}
995
996static void *
997malloc_internal(size_t size, percpu_t *cpuptr)
998{
999	cache_head_t *cachehead;
1000	cache_t *thiscache, *hintcache;
1001	int32_t i, n, logsz, bucket;
1002	uint32_t index;
1003	uint32_t *freeblocks; /* not a uintptr_t on purpose */
1004	caddr_t ret;
1005
1006	logsz = MIN_CACHED_SHIFT;
1007
1008	while (size > (1 << logsz))
1009		logsz++;
1010
1011	bucket = logsz - MIN_CACHED_SHIFT;
1012
1013	(void) mutex_lock(&cpuptr->mt_parent_lock);
1014
1015	/*
1016	 * Find a cache of the appropriate size with free buffers.
1017	 *
1018	 * We don't need to lock each cache as we check their mt_nfree count,
1019	 * since:
1020	 *	1.  We are only looking for caches with mt_nfree > 0.  If a
1021	 *	   free happens during our search, it will increment mt_nfree,
1022	 *	   which will not affect the test.
1023	 *	2.  Allocations can decrement mt_nfree, but they can't happen
1024	 *	   as long as we hold mt_parent_lock.
1025	 */
1026
1027	cachehead = &cpuptr->mt_caches[bucket];
1028
1029	/* Search through the list, starting at the mt_hint */
1030	thiscache = cachehead->mt_hint;
1031
1032	while (thiscache != NULL && thiscache->mt_nfree == 0)
1033		thiscache = thiscache->mt_next;
1034
1035	if (thiscache == NULL) {
1036		/* wrap around -- search up to the hint */
1037		thiscache = cachehead->mt_cache;
1038		hintcache = cachehead->mt_hint;
1039
1040		while (thiscache != NULL && thiscache != hintcache &&
1041		    thiscache->mt_nfree == 0)
1042			thiscache = thiscache->mt_next;
1043
1044		if (thiscache == hintcache)
1045			thiscache = NULL;
1046	}
1047
1048
1049	if (thiscache == NULL) { /* there are no free caches */
1050		int32_t thisrequest = requestsize;
1051		int32_t buffer_size = (1 << logsz) + OVERHEAD;
1052
1053		thiscache = (cache_t *)morecore(thisrequest * HUNKSIZE);
1054
1055		if (thiscache == (cache_t *)-1) {
1056			(void) mutex_unlock(&cpuptr->mt_parent_lock);
1057			errno = EAGAIN;
1058			return (NULL);
1059		}
1060		create_cache(thiscache, buffer_size, thisrequest);
1061
1062		/* link in the new block at the beginning of the list */
1063		thiscache->mt_next = cachehead->mt_cache;
1064		cachehead->mt_cache = thiscache;
1065	}
1066
1067	/* update the hint to the cache we found or created */
1068	cachehead->mt_hint = thiscache;
1069
1070	/* thiscache now points to a cache with available space */
1071	(void) mutex_lock(&thiscache->mt_cache_lock);
1072
1073	freeblocks = (uint32_t *)thiscache->mt_freelist;
1074	while (freeblocks < (uint32_t *)thiscache->mt_arena) {
1075		if (*freeblocks & 0xffffffff)
1076			break;
1077		freeblocks++;
1078		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1079		    *freeblocks & 0xffffffff)
1080			break;
1081		freeblocks++;
1082		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1083		    *freeblocks & 0xffffffff)
1084			break;
1085		freeblocks++;
1086		if (freeblocks < (uint32_t *)thiscache->mt_arena &&
1087		    *freeblocks & 0xffffffff)
1088			break;
1089		freeblocks++;
1090	}
1091
1092	/*
1093	 * The word offset from mt_freelist to freeblocks, plus the bit
1094	 * offset within that word, identifies the block; n below is the
1095	 * resulting byte offset into the arena.
1096	 */
1097	for (i = 0; i < 32; ) {
1098		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1099			break;
1100		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1101			break;
1102		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1103			break;
1104		if (FLIP_EM(*freeblocks) & (0x80000000 >> i++))
1105			break;
1106	}
1107	index = 0x80000000 >> --i;
1108
1109
1110	*freeblocks &= FLIP_EM(~index);
1111
1112	thiscache->mt_nfree--;
1113
1114	(void) mutex_unlock(&thiscache->mt_cache_lock);
1115	(void) mutex_unlock(&cpuptr->mt_parent_lock);
1116
1117	n = (uintptr_t)(((freeblocks - (uint32_t *)thiscache->mt_freelist) << 5)
1118	    + i) * thiscache->mt_size;
1119	/*
1120	 * Now that the offset is in n and the free mask in the freelist
1121	 * has been updated, all that is left is to find the block
1122	 * in the arena, put the value of thiscache in the word
1123	 * ahead of the handed-out address, and return the memory
1124	 * to the user.
1125	 */
1126	ret = thiscache->mt_arena + n;
1127
1128	/* Store the cache addr for this buf. Makes free go fast. */
1129	*(uintptr_t *)ret = (uintptr_t)thiscache;
1130
1131	/*
1132	 * This assert makes sure we don't hand out memory that is not
1133	 * owned by this cache.
1134	 */
1135	assert(ret + thiscache->mt_size <= thiscache->mt_freelist +
1136	    thiscache->mt_span);
1137
1138	ret += OVERHEAD;
1139
1140	assert(((uintptr_t)ret & 7) == 0); /* are we 8 byte aligned */
1141
1142	if (reinit == 0 && (debugopt & MTDEBUGPATTERN))
1143		if (verify_pattern(FREEPATTERN, ret, size))
1144			abort();	/* reference after free */
1145
1146	if (debugopt & MTINITBUFFER)
1147		copy_pattern(INITPATTERN, ret, size);
1148	return ((void *)ret);
1149}
1150
1151static void *
1152morecore(size_t bytes)
1153{
1154	void * ret;
1155
1156	if (bytes > LONG_MAX) {
1157		intptr_t wad;
1158		/*
1159		 * The request size is too big.  We need to do this in chunks
1160		 * of at most LONG_MAX bytes, since sbrk() takes a signed increment.
1161		 */
1162		if (bytes == ULONG_MAX)
1163			return ((void *)-1);
1164
1165		ret = sbrk(0);
1166		wad = LONG_MAX;
1167		while (wad > 0) {
1168			if (sbrk(wad) == (void *)-1) {
1169				if (ret != sbrk(0))
1170					(void) sbrk(-LONG_MAX);
1171				return ((void *)-1);
1172			}
1173			bytes -= LONG_MAX;
1174			wad = bytes;
1175		}
1176	} else
1177		ret = sbrk(bytes);
1178
1179	return (ret);
1180}
1181
1182
1183static void *
1184oversize(size_t size)
1185{
1186	caddr_t ret;
1187	oversize_t *big;
1188
1189	/* make sure we will not overflow */
1190	if (size > MAX_MTMALLOC) {
1191		errno = ENOMEM;
1192		return (NULL);
1193	}
1194
1195	/*
1196	 * Since we ensure every address we hand back is
1197	 * MTMALLOC_MIN_ALIGN-byte aligned, ALIGNing size ensures that the
1198	 * memory handed out is MTMALLOC_MIN_ALIGN-byte aligned at both ends.
1199	 * This eases the implementation of MTDEBUGPATTERN and MTINITPATTERN,
1200	 * particularly where coalescing occurs.
1201	 */
1202	size = ALIGN(size, MTMALLOC_MIN_ALIGN);
1203
1204	/*
1205	 * The idea behind the global lock is that an oversize
1206	 * allocation is likely to block in the kernel anyway, since
1207	 * it will usually have to call morecore().
1208	 */
1209	(void) mutex_lock(&oversize_lock);
1210
1211	if ((big = find_oversize(size)) != NULL) {
1212		if (reinit == 0 && (debugopt & MTDEBUGPATTERN))
1213			if (verify_pattern(FREEPATTERN, big->addr, size))
1214				abort();	/* reference after free */
1215	} else {
1216		/* Get more 8-byte aligned memory from heap */
1217		ret = morecore(size + OVSZ_HEADER_SIZE);
1218		if (ret == (caddr_t)-1) {
1219			(void) mutex_unlock(&oversize_lock);
1220			errno = ENOMEM;
1221			return (NULL);
1222		}
1223		big = oversize_header_alloc((uintptr_t)ret, size);
1224	}
1225	ret = big->addr;
1226
1227	insert_hash(big);
1228
1229	if (debugopt & MTINITBUFFER)
1230		copy_pattern(INITPATTERN, ret, size);
1231
1232	(void) mutex_unlock(&oversize_lock);
1233	assert(((uintptr_t)ret & 7) == 0); /* are we 8 byte aligned */
1234	return ((void *)ret);
1235}
1236
1237static void
1238insert_oversize(oversize_t *op, oversize_t *nx)
1239{
1240	oversize_t *sp;
1241
1242	/* locate correct insertion point in size-ordered list */
1243	for (sp = oversize_list.next_bysize;
1244	    sp != &oversize_list && (op->size > sp->size);
1245	    sp = sp->next_bysize)
1246		;
1247
1248	/* link into size-ordered list */
1249	op->next_bysize = sp;
1250	op->prev_bysize = sp->prev_bysize;
1251	op->prev_bysize->next_bysize = op;
1252	op->next_bysize->prev_bysize = op;
1253
1254	/*
1255	 * link item into address-ordered list
1256	 * (caller provides insertion point as an optimization)
1257	 */
1258	op->next_byaddr = nx;
1259	op->prev_byaddr = nx->prev_byaddr;
1260	op->prev_byaddr->next_byaddr = op;
1261	op->next_byaddr->prev_byaddr = op;
1262
1263}
1264
1265static void
1266unlink_oversize(oversize_t *lp)
1267{
1268	/* unlink from address list */
1269	lp->prev_byaddr->next_byaddr = lp->next_byaddr;
1270	lp->next_byaddr->prev_byaddr = lp->prev_byaddr;
1271
1272	/* unlink from size list */
1273	lp->prev_bysize->next_bysize = lp->next_bysize;
1274	lp->next_bysize->prev_bysize = lp->prev_bysize;
1275}
1276
1277static void
1278position_oversize_by_size(oversize_t *op)
1279{
1280	oversize_t *sp;
1281
1282	if (op->size > op->next_bysize->size ||
1283	    op->size < op->prev_bysize->size) {
1284
1285		/* unlink from size list */
1286		op->prev_bysize->next_bysize = op->next_bysize;
1287		op->next_bysize->prev_bysize = op->prev_bysize;
1288
1289		/* locate correct insertion point in size-ordered list */
1290		for (sp = oversize_list.next_bysize;
1291		    sp != &oversize_list && (op->size > sp->size);
1292		    sp = sp->next_bysize)
1293			;
1294
1295		/* link into size-ordered list */
1296		op->next_bysize = sp;
1297		op->prev_bysize = sp->prev_bysize;
1298		op->prev_bysize->next_bysize = op;
1299		op->next_bysize->prev_bysize = op;
1300	}
1301}
1302
1303static void
1304add_oversize(oversize_t *lp)
1305{
1306	int merge_flags = INSERT_ONLY;
1307	oversize_t *nx;  	/* ptr to item right of insertion point */
1308	oversize_t *pv;  	/* ptr to item left of insertion point */
1309	size_t size_lp, size_pv, size_nx;
1310	uintptr_t endp_lp, endp_pv, endp_nx;
1311
1312	/*
1313	 * Locate insertion point in address-ordered list
1314	 */
1315
1316	for (nx = oversize_list.next_byaddr;
1317	    nx != &oversize_list && (lp->addr > nx->addr);
1318	    nx = nx->next_byaddr)
1319		;
1320
1321	/*
1322	 * Determine how to add chunk to oversize freelist
1323	 */
1324
1325	size_lp = OVSZ_HEADER_SIZE + lp->size;
1326	endp_lp = ALIGN((uintptr_t)lp + size_lp, MTMALLOC_MIN_ALIGN);
1327	size_lp = endp_lp - (uintptr_t)lp;
1328
1329	pv = nx->prev_byaddr;
1330
1331	if (pv->size) {
1332
1333		size_pv = OVSZ_HEADER_SIZE + pv->size;
1334		endp_pv = ALIGN((uintptr_t)pv + size_pv,
1335		    MTMALLOC_MIN_ALIGN);
1336		size_pv = endp_pv - (uintptr_t)pv;
1337
1338		/* Check for adjacency with left chunk */
1339		if ((uintptr_t)lp == endp_pv)
1340			merge_flags |= COALESCE_LEFT;
1341	}
1342
1343	if (nx->size) {
1344
1345		/* Check for adjacency with right chunk */
1346		if ((uintptr_t)nx == endp_lp) {
1347			size_nx = OVSZ_HEADER_SIZE + nx->size;
1348			endp_nx = ALIGN((uintptr_t)nx + size_nx,
1349			    MTMALLOC_MIN_ALIGN);
1350			size_nx = endp_nx - (uintptr_t)nx;
1351			merge_flags |= COALESCE_RIGHT;
1352		}
1353	}
1354
1355	/*
1356	 * If MTDEBUGPATTERN==1, lp->addr will have been overwritten with
1357	 * FREEPATTERN for lp->size bytes. If we can merge, the oversize
1358	 * header(s) that will also become part of the memory available for
1359	 * reallocation (ie lp and/or nx) must also be overwritten with
1360	 * FREEPATTERN or we will SIGABRT when this memory is next reallocated.
1361	 */
1362	switch (merge_flags) {
1363
1364	case INSERT_ONLY:		/* Coalescing not possible */
1365		insert_oversize(lp, nx);
1366		break;
1367	case COALESCE_LEFT:
1368		pv->size += size_lp;
1369		position_oversize_by_size(pv);
1370		if (debugopt & MTDEBUGPATTERN)
1371			copy_pattern(FREEPATTERN, lp, OVSZ_HEADER_SIZE);
1372		break;
1373	case COALESCE_RIGHT:
1374		unlink_oversize(nx);
1375		lp->size += size_nx;
1376		insert_oversize(lp, pv->next_byaddr);
1377		if (debugopt & MTDEBUGPATTERN)
1378			copy_pattern(FREEPATTERN, nx, OVSZ_HEADER_SIZE);
1379		break;
1380	case COALESCE_WITH_BOTH_SIDES:	/* Merge (with right) to the left */
1381		pv->size += size_lp + size_nx;
1382		unlink_oversize(nx);
1383		position_oversize_by_size(pv);
1384		if (debugopt & MTDEBUGPATTERN) {
1385			copy_pattern(FREEPATTERN, lp, OVSZ_HEADER_SIZE);
1386			copy_pattern(FREEPATTERN, nx, OVSZ_HEADER_SIZE);
1387		}
1388		break;
1389	}
1390}
1391
1392/*
1393 * Find memory on our list that is at least size bytes. If we find a block that
1394 * is big enough, we break it up and return the associated oversize_t struct
1395 * to the caller. Any leftover piece of that block is returned to the
1396 * freelist.
1397 */
1398static oversize_t *
1399find_oversize(size_t size)
1400{
1401	oversize_t *wp = oversize_list.next_bysize;
1402	while (wp != &oversize_list && size > wp->size)
1403		wp = wp->next_bysize;
1404
1405	if (wp == &oversize_list) /* empty list or nothing big enough */
1406		return (NULL);
1407	/* breaking up a chunk of memory */
1408	if ((long)((wp->size - (size + OVSZ_HEADER_SIZE + MTMALLOC_MIN_ALIGN)))
1409	    > MAX_CACHED) {
1410		caddr_t off;
1411		oversize_t *np;
1412		size_t osize;
1413		off = (caddr_t)ALIGN(wp->addr + size,
1414		    MTMALLOC_MIN_ALIGN);
1415		osize = wp->size;
1416		wp->size = (size_t)(off - wp->addr);
1417		np = oversize_header_alloc((uintptr_t)off,
1418		    osize - (wp->size + OVSZ_HEADER_SIZE));
1419		if ((long)np->size < 0)
1420			abort();
1421		unlink_oversize(wp);
1422		add_oversize(np);
1423	} else {
1424		unlink_oversize(wp);
1425	}
1426	return (wp);
1427}
1428
1429static void
1430copy_pattern(uint32_t pattern, void *buf_arg, size_t size)
1431{
1432	uint32_t *bufend = (uint32_t *)((char *)buf_arg + size);
1433	uint32_t *buf = buf_arg;
1434
1435	while (buf < bufend - 3) {
1436		buf[3] = buf[2] = buf[1] = buf[0] = pattern;
1437		buf += 4;
1438	}
1439	while (buf < bufend)
1440		*buf++ = pattern;
1441}
1442
1443static void *
1444verify_pattern(uint32_t pattern, void *buf_arg, size_t size)
1445{
1446	uint32_t *bufend = (uint32_t *)((char *)buf_arg + size);
1447	uint32_t *buf;
1448
1449	for (buf = buf_arg; buf < bufend; buf++)
1450		if (*buf != pattern)
1451			return (buf);
1452	return (NULL);
1453}
1454
1455static void
1456free_oversize(oversize_t *ovp)
1457{
1458	assert(((uintptr_t)ovp->addr & 7) == 0); /* are we 8 byte aligned */
1459	assert(ovp->size > MAX_CACHED);
1460
1461	ovp->next_bysize = ovp->prev_bysize = NULL;
1462	ovp->next_byaddr = ovp->prev_byaddr = NULL;
1463	(void) mutex_lock(&oversize_lock);
1464	add_oversize(ovp);
1465	(void) mutex_unlock(&oversize_lock);
1466}
1467
1468static oversize_t *
1469oversize_header_alloc(uintptr_t mem, size_t size)
1470{
1471	oversize_t *ovsz_hdr;
1472
1473	assert(size > MAX_CACHED);
1474
1475	ovsz_hdr = (oversize_t *)mem;
1476	ovsz_hdr->prev_bysize = NULL;
1477	ovsz_hdr->next_bysize = NULL;
1478	ovsz_hdr->prev_byaddr = NULL;
1479	ovsz_hdr->next_byaddr = NULL;
1480	ovsz_hdr->hash_next = NULL;
1481	ovsz_hdr->size = size;
1482	mem += OVSZ_SIZE;
1483	*(uintptr_t *)mem = MTMALLOC_OVERSIZE_MAGIC;
1484	mem += OVERHEAD;
1485	assert(((uintptr_t)mem & 7) == 0); /* are we 8 byte aligned */
1486	ovsz_hdr->addr = (caddr_t)mem;
1487	return (ovsz_hdr);
1488}
1489
1490static void
1491malloc_prepare()
1492{
1493	percpu_t *cpuptr;
1494	cache_head_t *cachehead;
1495	cache_t *thiscache;
1496
1497	(void) mutex_lock(&oversize_lock);
1498	for (cpuptr = &cpu_list[0]; cpuptr < &cpu_list[ncpus]; cpuptr++) {
1499		(void) mutex_lock(&cpuptr->mt_parent_lock);
1500		for (cachehead = &cpuptr->mt_caches[0];
1501		    cachehead < &cpuptr->mt_caches[NUM_CACHES];
1502		    cachehead++) {
1503			for (thiscache = cachehead->mt_cache;
1504			    thiscache != NULL;
1505			    thiscache = thiscache->mt_next) {
1506				(void) mutex_lock(
1507				    &thiscache->mt_cache_lock);
1508			}
1509		}
1510	}
1511}
1512
1513static void
1514malloc_release()
1515{
1516	percpu_t *cpuptr;
1517	cache_head_t *cachehead;
1518	cache_t *thiscache;
1519
1520	for (cpuptr = &cpu_list[ncpus - 1]; cpuptr >= &cpu_list[0]; cpuptr--) {
1521		for (cachehead = &cpuptr->mt_caches[NUM_CACHES - 1];
1522		    cachehead >= &cpuptr->mt_caches[0];
1523		    cachehead--) {
1524			for (thiscache = cachehead->mt_cache;
1525			    thiscache != NULL;
1526			    thiscache = thiscache->mt_next) {
1527				(void) mutex_unlock(
1528				    &thiscache->mt_cache_lock);
1529			}
1530		}
1531		(void) mutex_unlock(&cpuptr->mt_parent_lock);
1532	}
1533	(void) mutex_unlock(&oversize_lock);
1534}
1535
1536#pragma init(malloc_init)
1537static void
1538malloc_init(void)
1539{
1540	/*
1541	 * This works in the init section for this library
1542	 * because setup_caches() doesn't call anything in libc
1543	 * that calls malloc().  If it did, disaster would ensue.
1544	 *
1545	 * For this to work properly, this library must be the first
1546	 * one to have its init section called (after libc) by the
1547	 * dynamic linker.  If some other library's init section
1548	 * ran first and called malloc(), disaster would ensue.
1549	 * Because this is an interposer library for malloc(), the
1550	 * dynamic linker arranges for its init section to run first.
1551	 */
1552	(void) setup_caches();
1553
1554	(void) pthread_atfork(malloc_prepare, malloc_release, malloc_release);
1555}
1556