memory.c revision 331769
1/*
2 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
3 * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses.  You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 *     Redistribution and use in source and binary forms, with or
12 *     without modification, are permitted provided that the following
13 *     conditions are met:
14 *
15 *      - Redistributions of source code must retain the above
16 *        copyright notice, this list of conditions and the following
17 *        disclaimer.
18 *
19 *      - Redistributions in binary form must reproduce the above
20 *        copyright notice, this list of conditions and the following
21 *        disclaimer in the documentation and/or other materials
22 *        provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34#include <config.h>
35
36#include <errno.h>
37#include <sys/mman.h>
38#include <unistd.h>
39#include <stdlib.h>
40#include <stdint.h>
41#include <stdio.h>
42#include <string.h>
43#include <dirent.h>
44#include <limits.h>
45#include <inttypes.h>
46
47#include "ibverbs.h"
48
49struct ibv_mem_node {
50	enum {
51		IBV_RED,
52		IBV_BLACK
53	}			color;
54	struct ibv_mem_node    *parent;
55	struct ibv_mem_node    *left, *right;
56	uintptr_t		start, end;
57	int			refcnt;
58};
59
60static struct ibv_mem_node *mm_root;
61static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
62static int page_size;
63static int huge_page_enabled;
64static int too_late;
65
66static unsigned long smaps_page_size(FILE *file)
67{
68	int n;
69	unsigned long size = page_size;
70	char buf[1024];
71
72	while (fgets(buf, sizeof(buf), file) != NULL) {
73		if (!strstr(buf, "KernelPageSize:"))
74			continue;
75
76		n = sscanf(buf, "%*s %lu", &size);
77		if (n < 1)
78			continue;
79
80		/* page size is printed in Kb */
81		size = size * 1024;
82
83		break;
84	}
85
86	return size;
87}
88
89static unsigned long get_page_size(void *base)
90{
91	unsigned long ret = page_size;
92	pid_t pid;
93	FILE *file;
94	char buf[1024];
95
96	pid = getpid();
97	snprintf(buf, sizeof(buf), "/proc/%d/smaps", pid);
98
99	file = fopen(buf, "r" STREAM_CLOEXEC);
100	if (!file)
101		goto out;
102
103	while (fgets(buf, sizeof(buf), file) != NULL) {
104		int n;
105		uintptr_t range_start, range_end;
106
107		n = sscanf(buf, "%" SCNxPTR "-%" SCNxPTR, &range_start, &range_end);
108
109		if (n < 2)
110			continue;
111
112		if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) {
113			ret = smaps_page_size(file);
114			break;
115		}
116	}
117
118	fclose(file);
119
120out:
121	return ret;
122}
123
124int ibv_fork_init(void)
125{
126	void *tmp, *tmp_aligned;
127	int ret;
128	unsigned long size;
129
130	if (getenv("RDMAV_HUGEPAGES_SAFE"))
131		huge_page_enabled = 1;
132
133	if (mm_root)
134		return 0;
135
136	if (too_late)
137		return EINVAL;
138
139	page_size = sysconf(_SC_PAGESIZE);
140	if (page_size < 0)
141		return errno;
142
143	if (posix_memalign(&tmp, page_size, page_size))
144		return ENOMEM;
145
146	if (huge_page_enabled) {
147		size = get_page_size(tmp);
148		tmp_aligned = (void *) ((uintptr_t) tmp & ~(size - 1));
149	} else {
150		size = page_size;
151		tmp_aligned = tmp;
152	}
153
154	ret = madvise(tmp_aligned, size, MADV_DONTFORK) ||
155	      madvise(tmp_aligned, size, MADV_DOFORK);
156
157	free(tmp);
158
159	if (ret)
160		return ENOSYS;
161
162	mm_root = malloc(sizeof *mm_root);
163	if (!mm_root)
164		return ENOMEM;
165
166	mm_root->parent = NULL;
167	mm_root->left   = NULL;
168	mm_root->right  = NULL;
169	mm_root->color  = IBV_BLACK;
170	mm_root->start  = 0;
171	mm_root->end    = UINTPTR_MAX;
172	mm_root->refcnt = 0;
173
174	return 0;
175}
176
177static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)
178{
179	if (node->left) {
180		node = node->left;
181		while (node->right)
182			node = node->right;
183	} else {
184		while (node->parent && node == node->parent->left)
185			node = node->parent;
186
187		node = node->parent;
188	}
189
190	return node;
191}
192
193static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)
194{
195	if (node->right) {
196		node = node->right;
197		while (node->left)
198			node = node->left;
199	} else {
200		while (node->parent && node == node->parent->right)
201			node = node->parent;
202
203		node = node->parent;
204	}
205
206	return node;
207}
208
209static void __mm_rotate_right(struct ibv_mem_node *node)
210{
211	struct ibv_mem_node *tmp;
212
213	tmp = node->left;
214
215	node->left = tmp->right;
216	if (node->left)
217		node->left->parent = node;
218
219	if (node->parent) {
220		if (node->parent->right == node)
221			node->parent->right = tmp;
222		else
223			node->parent->left = tmp;
224	} else
225		mm_root = tmp;
226
227	tmp->parent = node->parent;
228
229	tmp->right = node;
230	node->parent = tmp;
231}
232
233static void __mm_rotate_left(struct ibv_mem_node *node)
234{
235	struct ibv_mem_node *tmp;
236
237	tmp = node->right;
238
239	node->right = tmp->left;
240	if (node->right)
241		node->right->parent = node;
242
243	if (node->parent) {
244		if (node->parent->right == node)
245			node->parent->right = tmp;
246		else
247			node->parent->left = tmp;
248	} else
249		mm_root = tmp;
250
251	tmp->parent = node->parent;
252
253	tmp->left = node;
254	node->parent = tmp;
255}
256
257#if 0
258static int verify(struct ibv_mem_node *node)
259{
260	int hl, hr;
261
262	if (!node)
263		return 1;
264
265	hl = verify(node->left);
266	hr = verify(node->left);
267
268	if (!hl || !hr)
269		return 0;
270	if (hl != hr)
271		return 0;
272
273	if (node->color == IBV_RED) {
274		if (node->left && node->left->color != IBV_BLACK)
275			return 0;
276		if (node->right && node->right->color != IBV_BLACK)
277			return 0;
278		return hl;
279	}
280
281	return hl + 1;
282}
283#endif
284
285static void __mm_add_rebalance(struct ibv_mem_node *node)
286{
287	struct ibv_mem_node *parent, *gp, *uncle;
288
289	while (node->parent && node->parent->color == IBV_RED) {
290		parent = node->parent;
291		gp     = node->parent->parent;
292
293		if (parent == gp->left) {
294			uncle = gp->right;
295
296			if (uncle && uncle->color == IBV_RED) {
297				parent->color = IBV_BLACK;
298				uncle->color  = IBV_BLACK;
299				gp->color     = IBV_RED;
300
301				node = gp;
302			} else {
303				if (node == parent->right) {
304					__mm_rotate_left(parent);
305					node   = parent;
306					parent = node->parent;
307				}
308
309				parent->color = IBV_BLACK;
310				gp->color     = IBV_RED;
311
312				__mm_rotate_right(gp);
313			}
314		} else {
315			uncle = gp->left;
316
317			if (uncle && uncle->color == IBV_RED) {
318				parent->color = IBV_BLACK;
319				uncle->color  = IBV_BLACK;
320				gp->color     = IBV_RED;
321
322				node = gp;
323			} else {
324				if (node == parent->left) {
325					__mm_rotate_right(parent);
326					node   = parent;
327					parent = node->parent;
328				}
329
330				parent->color = IBV_BLACK;
331				gp->color     = IBV_RED;
332
333				__mm_rotate_left(gp);
334			}
335		}
336	}
337
338	mm_root->color = IBV_BLACK;
339}
340
341static void __mm_add(struct ibv_mem_node *new)
342{
343	struct ibv_mem_node *node, *parent = NULL;
344
345	node = mm_root;
346	while (node) {
347		parent = node;
348		if (node->start < new->start)
349			node = node->right;
350		else
351			node = node->left;
352	}
353
354	if (parent->start < new->start)
355		parent->right = new;
356	else
357		parent->left = new;
358
359	new->parent = parent;
360	new->left   = NULL;
361	new->right  = NULL;
362
363	new->color = IBV_RED;
364	__mm_add_rebalance(new);
365}
366
367static void __mm_remove(struct ibv_mem_node *node)
368{
369	struct ibv_mem_node *child, *parent, *sib, *tmp;
370	int nodecol;
371
372	if (node->left && node->right) {
373		tmp = node->left;
374		while (tmp->right)
375			tmp = tmp->right;
376
377		nodecol    = tmp->color;
378		child      = tmp->left;
379		tmp->color = node->color;
380
381		if (tmp->parent != node) {
382			parent        = tmp->parent;
383			parent->right = tmp->left;
384			if (tmp->left)
385				tmp->left->parent = parent;
386
387			tmp->left   	   = node->left;
388			node->left->parent = tmp;
389		} else
390			parent = tmp;
391
392		tmp->right          = node->right;
393		node->right->parent = tmp;
394
395		tmp->parent = node->parent;
396		if (node->parent) {
397			if (node->parent->left == node)
398				node->parent->left = tmp;
399			else
400				node->parent->right = tmp;
401		} else
402			mm_root = tmp;
403	} else {
404		nodecol = node->color;
405
406		child  = node->left ? node->left : node->right;
407		parent = node->parent;
408
409		if (child)
410			child->parent = parent;
411		if (parent) {
412			if (parent->left == node)
413				parent->left = child;
414			else
415				parent->right = child;
416		} else
417			mm_root = child;
418	}
419
420	free(node);
421
422	if (nodecol == IBV_RED)
423		return;
424
425	while ((!child || child->color == IBV_BLACK) && child != mm_root) {
426		if (parent->left == child) {
427			sib = parent->right;
428
429			if (sib->color == IBV_RED) {
430				parent->color = IBV_RED;
431				sib->color    = IBV_BLACK;
432				__mm_rotate_left(parent);
433				sib = parent->right;
434			}
435
436			if ((!sib->left  || sib->left->color  == IBV_BLACK) &&
437			    (!sib->right || sib->right->color == IBV_BLACK)) {
438				sib->color = IBV_RED;
439				child  = parent;
440				parent = child->parent;
441			} else {
442				if (!sib->right || sib->right->color == IBV_BLACK) {
443					if (sib->left)
444						sib->left->color = IBV_BLACK;
445					sib->color = IBV_RED;
446					__mm_rotate_right(sib);
447					sib = parent->right;
448				}
449
450				sib->color    = parent->color;
451				parent->color = IBV_BLACK;
452				if (sib->right)
453					sib->right->color = IBV_BLACK;
454				__mm_rotate_left(parent);
455				child = mm_root;
456				break;
457			}
458		} else {
459			sib = parent->left;
460
461			if (sib->color == IBV_RED) {
462				parent->color = IBV_RED;
463				sib->color    = IBV_BLACK;
464				__mm_rotate_right(parent);
465				sib = parent->left;
466			}
467
468			if ((!sib->left  || sib->left->color  == IBV_BLACK) &&
469			    (!sib->right || sib->right->color == IBV_BLACK)) {
470				sib->color = IBV_RED;
471				child  = parent;
472				parent = child->parent;
473			} else {
474				if (!sib->left || sib->left->color == IBV_BLACK) {
475					if (sib->right)
476						sib->right->color = IBV_BLACK;
477					sib->color = IBV_RED;
478					__mm_rotate_left(sib);
479					sib = parent->left;
480				}
481
482				sib->color    = parent->color;
483				parent->color = IBV_BLACK;
484				if (sib->left)
485					sib->left->color = IBV_BLACK;
486				__mm_rotate_right(parent);
487				child = mm_root;
488				break;
489			}
490		}
491	}
492
493	if (child)
494		child->color = IBV_BLACK;
495}
496
497static struct ibv_mem_node *__mm_find_start(uintptr_t start, uintptr_t end)
498{
499	struct ibv_mem_node *node = mm_root;
500
501	while (node) {
502		if (node->start <= start && node->end >= start)
503			break;
504
505		if (node->start < start)
506			node = node->right;
507		else
508			node = node->left;
509	}
510
511	return node;
512}
513
514static struct ibv_mem_node *merge_ranges(struct ibv_mem_node *node,
515					 struct ibv_mem_node *prev)
516{
517	prev->end = node->end;
518	prev->refcnt = node->refcnt;
519	__mm_remove(node);
520
521	return prev;
522}
523
524static struct ibv_mem_node *split_range(struct ibv_mem_node *node,
525					uintptr_t cut_line)
526{
527	struct ibv_mem_node *new_node = NULL;
528
529	new_node = malloc(sizeof *new_node);
530	if (!new_node)
531		return NULL;
532	new_node->start  = cut_line;
533	new_node->end    = node->end;
534	new_node->refcnt = node->refcnt;
535	node->end  = cut_line - 1;
536	__mm_add(new_node);
537
538	return new_node;
539}
540
541static struct ibv_mem_node *get_start_node(uintptr_t start, uintptr_t end,
542					   int inc)
543{
544	struct ibv_mem_node *node, *tmp = NULL;
545
546	node = __mm_find_start(start, end);
547	if (node->start < start)
548		node = split_range(node, start);
549	else {
550		tmp = __mm_prev(node);
551		if (tmp && tmp->refcnt == node->refcnt + inc)
552			node = merge_ranges(node, tmp);
553	}
554	return node;
555}
556
557/*
558 * This function is called if madvise() fails to undo merging/splitting
559 * operations performed on the node.
560 */
561static struct ibv_mem_node *undo_node(struct ibv_mem_node *node,
562				      uintptr_t start, int inc)
563{
564	struct ibv_mem_node *tmp = NULL;
565
566	/*
567	 * This condition can be true only if we merged this
568	 * node with the previous one, so we need to split them.
569	*/
570	if (start > node->start) {
571		tmp = split_range(node, start);
572		if (tmp) {
573			node->refcnt += inc;
574			node = tmp;
575		} else
576			return NULL;
577	}
578
579	tmp  =  __mm_prev(node);
580	if (tmp && tmp->refcnt == node->refcnt)
581		node = merge_ranges(node, tmp);
582
583	tmp  =  __mm_next(node);
584	if (tmp && tmp->refcnt == node->refcnt)
585		node = merge_ranges(tmp, node);
586
587	return node;
588}
589
590static int ibv_madvise_range(void *base, size_t size, int advice)
591{
592	uintptr_t start, end;
593	struct ibv_mem_node *node, *tmp;
594	int inc;
595	int rolling_back = 0;
596	int ret = 0;
597	unsigned long range_page_size;
598
599	if (!size)
600		return 0;
601
602	if (huge_page_enabled)
603		range_page_size = get_page_size(base);
604	else
605		range_page_size = page_size;
606
607	start = (uintptr_t) base & ~(range_page_size - 1);
608	end   = ((uintptr_t) (base + size + range_page_size - 1) &
609		 ~(range_page_size - 1)) - 1;
610
611	pthread_mutex_lock(&mm_mutex);
612again:
613	inc = advice == MADV_DONTFORK ? 1 : -1;
614
615	node = get_start_node(start, end, inc);
616	if (!node) {
617		ret = -1;
618		goto out;
619	}
620
621	while (node && node->start <= end) {
622		if (node->end > end) {
623			if (!split_range(node, end + 1)) {
624				ret = -1;
625				goto out;
626			}
627		}
628
629		if ((inc == -1 && node->refcnt == 1) ||
630		    (inc ==  1 && node->refcnt == 0)) {
631			/*
632			 * If this is the first time through the loop,
633			 * and we merged this node with the previous
634			 * one, then we only want to do the madvise()
635			 * on start ... node->end (rather than
636			 * starting at node->start).
637			 *
638			 * Otherwise we end up doing madvise() on
639			 * bigger region than we're being asked to,
640			 * and that may lead to a spurious failure.
641			 */
642			if (start > node->start)
643				ret = madvise((void *) start, node->end - start + 1,
644					      advice);
645			else
646				ret = madvise((void *) node->start,
647					      node->end - node->start + 1,
648					      advice);
649			if (ret) {
650				node = undo_node(node, start, inc);
651
652				if (rolling_back || !node)
653					goto out;
654
655				/* madvise failed, roll back previous changes */
656				rolling_back = 1;
657				advice = advice == MADV_DONTFORK ?
658					MADV_DOFORK : MADV_DONTFORK;
659				tmp = __mm_prev(node);
660				if (!tmp || start > tmp->end)
661					goto out;
662				end = tmp->end;
663				goto again;
664			}
665		}
666
667		node->refcnt += inc;
668		node = __mm_next(node);
669	}
670
671	if (node) {
672		tmp = __mm_prev(node);
673		if (tmp && node->refcnt == tmp->refcnt)
674			node = merge_ranges(node, tmp);
675	}
676
677out:
678	if (rolling_back)
679		ret = -1;
680
681	pthread_mutex_unlock(&mm_mutex);
682
683	return ret;
684}
685
686int ibv_dontfork_range(void *base, size_t size)
687{
688	if (mm_root)
689		return ibv_madvise_range(base, size, MADV_DONTFORK);
690	else {
691		too_late = 1;
692		return 0;
693	}
694}
695
696int ibv_dofork_range(void *base, size_t size)
697{
698	if (mm_root)
699		return ibv_madvise_range(base, size, MADV_DOFORK);
700	else {
701		too_late = 1;
702		return 0;
703	}
704}
705