/**
 * \file
 * \brief General NUMA functions
 *
 */

/*
 * Copyright (c) 2014, ETH Zurich.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Universitaetstr. 6, CH-8092 Zurich. Attn: Systems Group.
 */

#include <stdio.h>
#include <string.h>

#include <barrelfish/barrelfish.h>

#include <numa.h>
#include <bitmap.h>
#include "numa_internal.h"

///< numa interleave mask for allocations
struct bitmap *numa_alloc_interleave_mask;

///< numa bind mask for allocations
struct bitmap *numa_alloc_bind_mask;

/**
 * \brief validates the given page size and sets the flags
 *
 * \param pagesize  desired page size
 * \param flags     returns the VREGION_FLAGS_*
 *
 * \returns the selected page size
 *
 * If the page size is not known or not supported, BASE_PAGE_SIZE is used.
 */
static size_t validate_page_size(size_t pagesize, vregion_flags_t *flags)
{
#if defined(__x86_64__) || defined(__aarch64__)
    /* use huge, large or base pages on 64 bits */
    switch(pagesize) {
        case LARGE_PAGE_SIZE:
            *flags = VREGION_FLAGS_READ_WRITE | VREGION_FLAGS_LARGE;
            return LARGE_PAGE_SIZE;
        case HUGE_PAGE_SIZE:
            *flags = VREGION_FLAGS_READ_WRITE | VREGION_FLAGS_HUGE;
            return HUGE_PAGE_SIZE;
        case BASE_PAGE_SIZE:
        default:
            *flags = VREGION_FLAGS_READ_WRITE;
            return BASE_PAGE_SIZE;
    }
#elif defined(__x86_32__) || (defined(__arm__) && !defined(__aarch64__))
    /* use base or large pages on 32 bits */
    switch(pagesize) {
        case LARGE_PAGE_SIZE:
            *flags = VREGION_FLAGS_READ_WRITE | VREGION_FLAGS_LARGE;
            return LARGE_PAGE_SIZE;
        case BASE_PAGE_SIZE:
        default:
            *flags = VREGION_FLAGS_READ_WRITE;
            return BASE_PAGE_SIZE;
    }
#else
    /* fall back to the base page size on unknown architectures */
    *flags = VREGION_FLAGS_READ_WRITE;
    return BASE_PAGE_SIZE;
#endif
}
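
/*
 * A minimal sketch of how the helper above is used by the allocation paths
 * below; the concrete values assume an x86_64 build where LARGE_PAGE_SIZE is
 * the 2 MB page size, and `request` stands in for a caller-supplied byte count:
 *
 *     vregion_flags_t flags;
 *     size_t pagesize = validate_page_size(LARGE_PAGE_SIZE, &flags);
 *     // pagesize == LARGE_PAGE_SIZE, flags include VREGION_FLAGS_LARGE
 *
 *     pagesize = validate_page_size(12345, &flags);
 *     // unknown value: pagesize == BASE_PAGE_SIZE, flags == VREGION_FLAGS_READ_WRITE
 *
 *     // callers then round their request up to a multiple of the page size
 *     size_t bytes = (request + pagesize - 1) & ~(pagesize - 1);
 */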


/** \brief   returns the current interleave mask
 *
 * \returns bitmask representing the current interleave state
 *
 * Returns a copy of the current interleave mask if the task's memory allocation
 * policy is page interleaved. Otherwise, this function returns an empty mask.
 */
struct bitmap *numa_get_interleave_mask(void)
{
    assert(numa_alloc_interleave_mask);
    struct bitmap *im = numa_allocate_nodemask();
    if (im == NULL) {
        return NULL;
    }
    bitmap_copy(im, numa_alloc_interleave_mask);
    return im;
}


/**
 * \brief sets the memory interleave mask for the current task to nodemask
 *
 * \param nodemask bitmask representing the nodes
 *
 * All new memory allocations are page interleaved over all nodes in the interleave
 * mask. Interleaving can be turned off again by passing an empty mask.
 *
 * This bitmask is considered to be a hint. Fallback to other nodes may be possible.
 */
void numa_set_interleave_mask(struct bitmap *nodemask)
{
    assert(numa_alloc_interleave_mask);

    if (!nodemask) {
        bitmap_clear_all(numa_alloc_interleave_mask);
        return;
    }

    if (bitmap_get_nbits(nodemask) < NUMA_MAX_NUMNODES) {
        NUMA_WARNING("supplied interleave mask (%p) has too few bits!", nodemask);
        return;
    }
    bitmap_copy(numa_alloc_interleave_mask, nodemask);

    /* clear out the invalid nodes */
    bitmap_clear_range(numa_alloc_interleave_mask, numa_num_configured_nodes(),
                       bitmap_get_nbits(numa_alloc_interleave_mask));

    /* clear the bind mask as we are using interleaving mode now */
    bitmap_clear_all(numa_alloc_bind_mask);
}
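
/*
 * A minimal usage sketch for enabling interleaved allocations. It assumes the
 * library has been initialised beforehand (e.g. via numa_available(), as in
 * Linux libnuma) and that nodes 0 and 1 exist; it only uses calls that appear
 * in this file:
 *
 *     struct bitmap *mask = numa_allocate_nodemask();
 *     assert(mask != NULL);
 *     bitmap_set_bit(mask, 0);
 *     bitmap_set_bit(mask, 1);
 *     numa_set_interleave_mask(mask);   // numa_alloc() now interleaves over nodes 0 and 1
 *     numa_set_interleave_mask(NULL);   // turn interleaving off again
 */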


/**
 * \brief binds the current task and its children to the nodes specified in nodemask.
 *
 * \param nodemask  bitmap representing the nodes
 */
void numa_bind(struct bitmap *nodemask)
{
    USER_PANIC("Not yet implemented");
}


/**
 * \brief sets the memory allocation policy for the calling task to local allocation.
 */
void numa_set_localalloc(void)
{
    assert(numa_alloc_bind_mask);
    assert(numa_alloc_interleave_mask);

    /* clear interleave mode */
    bitmap_clear_all(numa_alloc_interleave_mask);

    bitmap_clear_all(numa_alloc_bind_mask);
    bitmap_set_bit(numa_alloc_bind_mask, numa_current_node());
}
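
/*
 * Sketch of the local-allocation policy in effect. After the call the bind
 * mask contains exactly the current node, so numa_alloc() below takes its
 * single-node path; the size and page size are illustrative:
 *
 *     numa_set_localalloc();
 *     void *buf = numa_alloc(1UL << 20, BASE_PAGE_SIZE);   // backed by local memory
 *     // ... use buf ...
 *     numa_free(buf, 1UL << 20);
 */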

/**
 * \brief sets the memory allocation mask.
 *
 * \param nodemask  bitmap representing the nodes
 *
 * The task will only allocate memory from the nodes set in nodemask.
 *
 * An empty mask, or a mask that contains no allowed nodes, results in an error.
 */
errval_t numa_set_membind(struct bitmap *nodemask)
{
    assert(numa_alloc_bind_mask);
    assert(numa_alloc_interleave_mask);

    if (!nodemask) {
        return NUMA_ERR_BITMAP_PARSE;
    }

    if (bitmap_get_nbits(nodemask) < NUMA_MAX_NUMNODES) {
        NUMA_WARNING("supplied membind mask (%p) has too few bits!", nodemask);
        return NUMA_ERR_BITMAP_RANGE;
    }

    /* copy new membind mask and clear out invalid bits */
    bitmap_copy(numa_alloc_bind_mask, nodemask);
    bitmap_clear_range(numa_alloc_bind_mask, numa_num_configured_nodes(),
                       bitmap_get_nbits(numa_alloc_bind_mask));

    if (bitmap_get_weight(numa_alloc_bind_mask) == 0) {
        /* cannot bind to an empty node set, restore the all-nodes mask */
        bitmap_copy(numa_alloc_bind_mask, numa_all_nodes_ptr);
        return NUMA_ERR_NUMA_MEMBIND;
    }

    /* disable interleaving mode */
    bitmap_clear_all(numa_alloc_interleave_mask);

    return SYS_ERR_OK;
}
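
/*
 * A minimal sketch of binding all allocations to a single node; it assumes
 * node 0 is configured:
 *
 *     struct bitmap *mask = numa_allocate_nodemask();
 *     assert(mask != NULL);
 *     bitmap_set_bit(mask, 0);
 *     errval_t err = numa_set_membind(mask);
 *     if (err_is_fail(err)) {
 *         DEBUG_ERR(err, "numa_set_membind");
 *     }
 *     // numa_alloc() now allocates from node 0 only
 */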


/**
 * \brief returns the mask of nodes from which memory can currently be allocated.
 *
 * \return bitmap of nodes from which memory can be allocated
 */
struct bitmap *numa_get_membind(void)
{
    assert(numa_alloc_bind_mask);
    struct bitmap *im = numa_allocate_nodemask();
    if (im == NULL) {
        return NULL;
    }
    bitmap_copy(im, numa_alloc_bind_mask);
    return im;
}


/**
 * \brief allocates memory on a specific node.
 *
 * \param size      size of the region in bytes
 * \param node      ID of the node to allocate from
 * \param pagesize  page size to be used for the mapping
 *
 * \returns pointer to memory region
 *
 * The size argument will be rounded up to a multiple of the selected page size.
 * If the specified node is externally denied to this process, this call will fail.
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc_onnode(size_t size, nodeid_t node, size_t pagesize)
{
    errval_t err;

    /*
     * TODO: keep track of the allocated numa frames
     */

    NUMA_DEBUG_ALLOC("allocate on node %" PRIuNODEID "\n", node);

    /* validate page size and round up size */
    vregion_flags_t flags;
    pagesize = validate_page_size(pagesize, &flags);
    size = (size + pagesize - 1) & ~(pagesize - 1);

    /* allocate frame */
    struct capref frame;
    size_t ret_size;
    err = numa_frame_alloc_on_node(&frame, size, node, &ret_size);
    if (err_is_fail(err)) {
        return NULL;
    }

    NUMA_DEBUG_ALLOC("mapping allocated frame\n");

    void *addr;
    err = vspace_map_one_frame_attr_aligned(&addr, size, frame, flags,
                                            pagesize, NULL, NULL);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "vspace_map_one_frame_attr_aligned");
        err = numa_frame_free(frame);
        if (err_is_fail(err)) {
            USER_PANIC_ERR(err, "nested error while freeing frame");
        }
        return NULL;
    }

    NUMA_DEBUG_ALLOC("frame mapped @ %p\n", addr);

    return addr;
}
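
/*
 * A short usage sketch for allocating directly on a given node; the node id
 * and size are illustrative, and the page size is only a hint (unsupported
 * values fall back to BASE_PAGE_SIZE, see validate_page_size()):
 *
 *     void *buf = numa_alloc_onnode(16UL * 1024 * 1024, 0, LARGE_PAGE_SIZE);
 *     if (buf == NULL) {
 *         // frame allocation or mapping failed
 *     }
 *     // ... use buf ...
 *     numa_free(buf, 16UL * 1024 * 1024);
 */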


/**
 * \brief allocates size bytes of memory on the local node
 *
 * \param size      size of the memory region in bytes
 * \param pagesize  page size to be used for the mapping
 *
 * \returns pointer to memory region
 *
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc_local(size_t size, size_t pagesize)
{
    nodeid_t node = numa_current_node();

    NUMA_DEBUG_ALLOC("allocate on local node %" PRIuNODEID "\n", node);

    return numa_alloc_onnode(size, node, pagesize);
}


/**
 * \brief allocates size bytes of memory page interleaved on all nodes.
 *
 * \param size      size of the memory region in bytes
 * \param pagesize  preferred page size to be used
 *
 * \returns pointer to the mapped memory region
 *
 * Should only be used for large areas consisting of multiple pages.
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc_interleaved(size_t size, size_t pagesize)
{
    return numa_alloc_interleaved_subset(size, pagesize, numa_all_nodes_ptr);
}


/**
 * \brief allocates size bytes of memory page interleaved on the nodes specified
 *        in the nodemask.
 *
 * \param size      size of the memory region in bytes
 * \param pagesize  preferred page size to be used
 * \param nodemask  subset of nodes to consider for allocation
 *
 * \returns pointer to the mapped memory region
 *
 * Should only be used for large areas consisting of multiple pages.
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc_interleaved_subset(size_t size, size_t pagesize,
                                    struct bitmap *nodemask)
{
    errval_t err;

    /* clear out invalid bits */
    bitmap_clear_range(nodemask, numa_num_configured_nodes(),
                       bitmap_get_nbits(nodemask));

    /* get the number of nodes */
    nodeid_t nodes = bitmap_get_weight(nodemask);
    if (nodes == 0) {
        return NULL;
    }

    NUMA_DEBUG_ALLOC("allocating interleaved using %" PRIuNODEID " nodes\n", nodes);

    assert(nodes <= numa_num_configured_nodes());

    vregion_flags_t flags;
    pagesize = validate_page_size(pagesize, &flags);
    size_t stride = pagesize;

    size_t node_size = size / nodes;
    node_size = (node_size + pagesize - 1) & ~(pagesize - 1);

    /* update the total size as it may change due to rounding of the node size */
    size = nodes * node_size;

    /*
     * XXX: we may want to keep track of numa allocated frames
     */

    struct memobj_numa *memobj = calloc(1, sizeof(struct memobj_numa));
    if (memobj == NULL) {
        return NULL;
    }
    err = memobj_create_numa(memobj, size, 0, numa_num_configured_nodes(), stride);
    if (err_is_fail(err)) {
        free(memobj);
        return NULL;
    }

    bitmap_bit_t node = bitmap_get_first(nodemask);
    nodeid_t node_idx = 0;
    while (node != BITMAP_BIT_NONE) {
        struct capref frame;
        err = numa_frame_alloc_on_node(&frame, node_size, (nodeid_t)node, NULL);
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "numa_frame_alloc_on_node");
            goto out_err;
        }
        memobj->m.f.fill(&memobj->m, node_idx, frame, 0);
        ++node_idx;
        node = bitmap_get_next(nodemask, node);
    }

    struct vregion *vreg = calloc(1, sizeof(struct vregion));
    if (vreg == NULL) {
        goto out_err;
    }
    err = vregion_map_aligned(vreg, get_current_vspace(), &memobj->m, 0, size,
                              flags, pagesize);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "vregion_map_aligned");
        goto out_err;
    }

    err = memobj->m.f.pagefault(&memobj->m, vreg, 0, 0);
    if (err_is_fail(err)) {
        vregion_destroy(vreg);
        free(vreg);
        DEBUG_ERR(err, "memobj.m.f.pagefault");
        goto out_err;
    }

    // XXX - Is this right?
    return (void *)(uintptr_t)vregion_get_base_addr(vreg);

    out_err:
    /* unfill and delete the frames that were already inserted into the memobj */
    for (int i = 0; i < node_idx; ++i) {
        struct capref frame;
        memobj->m.f.unfill(&memobj->m, i, &frame, NULL);
        cap_delete(frame);
    }
    free(memobj);
    return NULL;

}
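
/*
 * A usage sketch for interleaving a large buffer over an explicit subset of
 * nodes; the node ids and buffer size are illustrative:
 *
 *     struct bitmap *mask = numa_allocate_nodemask();
 *     assert(mask != NULL);
 *     bitmap_set_bit(mask, 0);
 *     bitmap_set_bit(mask, 1);
 *     void *buf = numa_alloc_interleaved_subset(64UL * 1024 * 1024,
 *                                               LARGE_PAGE_SIZE, mask);
 *     if (buf == NULL) {
 *         // no node in the mask is configured, or allocation/mapping failed
 *     }
 *     // consecutive pages of buf alternate between node 0 and node 1
 */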


/**
 * \brief allocates size bytes of memory with the current NUMA policy.
 *
 * \param size      size of the memory region in bytes
 * \param pagesize  preferred page size to be used
 * \returns pointer to the mapped memory region
 *
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc(size_t size, size_t pagesize)
{
    NUMA_DEBUG_ALLOC("allocate according to policy\n");

    /* check if we use interleaved mode */
    if (bitmap_get_weight(numa_alloc_interleave_mask)) {
        return numa_alloc_interleaved_subset(size, pagesize,
                                             numa_alloc_interleave_mask);
    }

    /* check membind */
    if (bitmap_get_weight(numa_alloc_bind_mask) == 1) {
        nodeid_t node = (nodeid_t) bitmap_get_first(numa_alloc_bind_mask);
        return numa_alloc_onnode(size, node, pagesize);
    }

    /* TODO:
     * - handle the case where multiple nodes are set in membind
     */

    /* just return some memory */
    return malloc(size);

}
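
/*
 * Sketch of how the policy checks above resolve, assuming a state in which
 * both the interleave mask and the bind mask start out empty:
 *
 *     void *a = numa_alloc(4096, BASE_PAGE_SIZE);       // falls through to malloc()
 *
 *     numa_set_localalloc();                            // bind mask = { current node }
 *     void *b = numa_alloc(4096, BASE_PAGE_SIZE);       // numa_alloc_onnode() on the local node
 *
 *     numa_set_interleave_mask(numa_all_nodes_ptr);     // interleaving takes precedence
 *     void *c = numa_alloc(1UL << 21, LARGE_PAGE_SIZE); // interleaved over all nodes
 */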


/**
 * \brief changes the size of the memory area.
 *
 * \param old_addr  pointer to the old memory region
 * \param old_size  size of the old memory region
 * \param new_size  new size to allocate
 */
void *numa_realloc(void *old_addr, size_t old_size, size_t new_size)
{
    assert(!"NYI");
    return 0;
}


/**
 * \brief frees size bytes of memory starting at start
 *
 * \param start start of the memory region
 * \param size  number of bytes to free
 *
 * The memory must have been allocated previously by one of the numa_alloc* functions.
 */
void numa_free(void *start, size_t size)
{
    assert(!"NYI");
}



/**
 * \brief allocates a frame on a specific node
 *
 * \param dest      capref to store the frame
 * \param size      size of the frame to allocate
 * \param node      node on which the frame should be allocated
 * \param ret_size  returned size of the frame capability
 *
 * \returns SYS_ERR_OK on SUCCESS
 *          errval on FAILURE
 */
errval_t numa_frame_alloc_on_node(struct capref *dest,
                                  size_t size,
                                  nodeid_t node,
                                  size_t *ret_size)
{
    errval_t err;

    NUMA_DEBUG_ALLOC("allocating frame on node %" PRIuNODEID "\n", node);

    uint64_t min_base, max_limit;
    ram_get_affinity(&min_base, &max_limit);

    if (node >= numa_topology.num_nodes) {
        return NUMA_ERR_NODEID_INVALID;
    }

    uint64_t node_base = numa_node_base(node);
    uint64_t node_limit = node_base + numa_node_size(node, NULL);

    NUMA_DEBUG_ALLOC("setting affinity to 0x%" PRIx64 "..0x%" PRIx64 "\n",
                     node_base, node_limit);

    ram_set_affinity(node_base, node_limit);

    err = frame_alloc(dest, size, ret_size);

    ram_set_affinity(min_base, max_limit);

    NUMA_DEBUG_ALLOC("restore affinity to 0x%" PRIx64 "..0x%" PRIx64 "\n",
                     min_base, max_limit);

    return err;
}
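
/*
 * A short sketch of allocating a frame on a node and mapping it by hand,
 * mirroring what numa_alloc_onnode() does above; the node id and size are
 * illustrative:
 *
 *     struct capref frame;
 *     size_t ret_size;
 *     errval_t err = numa_frame_alloc_on_node(&frame, 2UL * 1024 * 1024, 0, &ret_size);
 *     if (err_is_fail(err)) {
 *         DEBUG_ERR(err, "numa_frame_alloc_on_node");
 *     }
 *     void *addr;
 *     err = vspace_map_one_frame_attr_aligned(&addr, ret_size, frame,
 *                                             VREGION_FLAGS_READ_WRITE,
 *                                             BASE_PAGE_SIZE, NULL, NULL);
 */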


/**
 * \brief frees a previously allocated frame
 *
 * \param frame capability to free
 */
errval_t numa_frame_free(struct capref frame)
{
    assert(!"NYI");
    return 0;
}


/**
 * \brief  moves a list of pages in the address space of the specified domain
 *
 * \param did    the domain ID
 * \param count  number of pages to move
 * \param pages  list of pages
 * \param nodes  list of nodes to which the pages can be moved
 * \param status returns the outcome for each page
 * \param flags  flags for moving the pages
 *
 * \returns SYS_ERR_OK on SUCCESS
 */
errval_t numa_move_pages(domainid_t did,
                         size_t count,
                         void **pages,
                         const nodeid_t *nodes,
                         errval_t *status,
                         int flags)
{
    assert(!"NYI");
    return 0;
}


/**
 * \brief migrate a domain from one set of nodes to another
 *
 * \param did        the domain ID
 * \param fromnodes  bitmap representing the source nodes
 * \param tonodes    bitmap representing the destination nodes
 *
 * \returns SYS_ERR_OK on SUCCESS
 */
errval_t numa_migrate_pages(domainid_t did,
                            struct bitmap *fromnodes,
                            struct bitmap *tonodes)
{
    assert(!"NYI");
    return 0;
}