/**
 * \file
 * \brief General Numa functions
 *
 */

/*
 * Copyright (c) 2014, ETH Zurich.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Universitaetstr. 6, CH-8092 Zurich. Attn: Systems Group.
 */

#include <stdio.h>
#include <string.h>

#include <barrelfish/barrelfish.h>

#include <numa.h>
#include <bitmap.h>
#include "numa_internal.h"

/// numa interleave mask for allocations
struct bitmap *numa_alloc_interleave_mask;

/// numa bind mask for allocations
struct bitmap *numa_alloc_bind_mask;

/**
 * \brief validates the given page size and sets the mapping flags
 *
 * \param pagesize desired page size
 * \param flags    returns the VREGION_FLAGS_* for the mapping
 *
 * \returns activated page size
 *
 * If the page size is not known or not supported, BASE_PAGE_SIZE is returned.
 */
static size_t validate_page_size(size_t pagesize, vregion_flags_t *flags)
{
#if defined(__x86_64__) || defined(__aarch64__)
    /* use huge, large or base pages on 64-bit architectures */
    switch (pagesize) {
        case LARGE_PAGE_SIZE:
            *flags = VREGION_FLAGS_READ_WRITE | VREGION_FLAGS_LARGE;
            return LARGE_PAGE_SIZE;
        case HUGE_PAGE_SIZE:
            *flags = VREGION_FLAGS_READ_WRITE | VREGION_FLAGS_HUGE;
            return HUGE_PAGE_SIZE;
        case BASE_PAGE_SIZE:
        default:
            *flags = VREGION_FLAGS_READ_WRITE;
            return BASE_PAGE_SIZE;
    }
#elif defined(__x86_32__) || (defined(__arm__) && !defined(__aarch64__))
    /* use base or large pages on 32-bit architectures */
    switch (pagesize) {
        case LARGE_PAGE_SIZE:
            *flags = VREGION_FLAGS_READ_WRITE | VREGION_FLAGS_LARGE;
            return LARGE_PAGE_SIZE;
        case BASE_PAGE_SIZE:
        default:
            *flags = VREGION_FLAGS_READ_WRITE;
            return BASE_PAGE_SIZE;
    }
#else
    /* fall back to the base page size on unknown architectures */
    *flags = VREGION_FLAGS_READ_WRITE;
    return BASE_PAGE_SIZE;
#endif
}


/** \brief returns the current interleave mask
 *
 * \returns bitmask representing the current interleave state
 *
 * Returns a copy of the current interleave mask if the task's memory
 * allocation policy is page interleaved. Otherwise, this function returns
 * an empty mask.
 */
struct bitmap *numa_get_interleave_mask(void)
{
    assert(numa_alloc_interleave_mask);
    struct bitmap *im = numa_allocate_nodemask();
    if (im == NULL) {
        return NULL;
    }
    bitmap_copy(im, numa_alloc_interleave_mask);
    return im;
}
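
/*
 * Usage sketch (illustrative only, not part of this file's implementation):
 * since numa_get_interleave_mask() returns an empty mask when interleaving
 * is off, a caller can test the current policy with bitmap_get_weight(),
 * the same helper this file uses in numa_alloc().
 *
 * \code
 * struct bitmap *im = numa_get_interleave_mask();
 * if (im != NULL && bitmap_get_weight(im) > 0) {
 *     // allocations are currently page interleaved
 * }
 * \endcode
 */
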
/**
 * \brief sets the memory interleave mask for the current task to nodemask
 *
 * \param nodemask bitmask representing the nodes
 *
 * All new memory allocations are page interleaved over all nodes in the
 * interleave mask. Interleaving can be turned off again by passing an
 * empty mask.
 *
 * This bitmask is considered to be a hint. Fallback to other nodes may be
 * possible.
 */
void numa_set_interleave_mask(struct bitmap *nodemask)
{
    assert(numa_alloc_interleave_mask);

    if (!nodemask) {
        bitmap_clear_all(numa_alloc_interleave_mask);
        return;
    }

    if (bitmap_get_nbits(nodemask) < NUMA_MAX_NUMNODES) {
        NUMA_WARNING("supplied interleave mask (%p) has too few bits!", nodemask);
        return;
    }
    bitmap_copy(numa_alloc_interleave_mask, nodemask);

    /* clear out the invalid nodes */
    bitmap_clear_range(numa_alloc_interleave_mask, numa_num_configured_nodes(),
                       bitmap_get_nbits(numa_alloc_interleave_mask));

    /* clear the bind mask as we are using interleaving mode now */
    bitmap_clear_all(numa_alloc_bind_mask);
}


/**
 * \brief binds the current task and its children to the nodes specified in
 *        nodemask.
 *
 * \param nodemask bitmap representing the nodes
 */
void numa_bind(struct bitmap *nodemask)
{
    USER_PANIC("Not yet implemented");
}


/**
 * \brief sets the memory allocation policy for the calling task to local
 *        allocation.
 */
void numa_set_localalloc(void)
{
    assert(numa_alloc_bind_mask);
    assert(numa_alloc_interleave_mask);

    /* clear interleave mode */
    bitmap_clear_all(numa_alloc_interleave_mask);

    bitmap_clear_all(numa_alloc_bind_mask);
    bitmap_set_bit(numa_alloc_bind_mask, numa_current_node());
}

/**
 * \brief sets the memory allocation mask.
 *
 * \param nodemask bitmap representing the nodes
 *
 * The task will only allocate memory from the nodes set in nodemask.
 *
 * An empty mask or a mask containing only disallowed nodes results in an
 * error.
 */
errval_t numa_set_membind(struct bitmap *nodemask)
{
    assert(numa_alloc_bind_mask);
    assert(numa_alloc_interleave_mask);

    if (!nodemask) {
        return NUMA_ERR_BITMAP_PARSE;
    }

    if (bitmap_get_nbits(nodemask) < NUMA_MAX_NUMNODES) {
        NUMA_WARNING("supplied membind mask (%p) has too few bits!", nodemask);
        return NUMA_ERR_BITMAP_RANGE;
    }

    /* copy new membind mask and clear out invalid bits */
    bitmap_copy(numa_alloc_bind_mask, nodemask);
    bitmap_clear_range(numa_alloc_bind_mask, numa_num_configured_nodes(),
                       bitmap_get_nbits(numa_alloc_bind_mask));

    if (bitmap_get_weight(numa_alloc_bind_mask) == 0) {
        /* cannot bind to no node at all; restore the all-nodes mask */
        bitmap_copy(numa_alloc_bind_mask, numa_all_nodes_ptr);
        return NUMA_ERR_NUMA_MEMBIND;
    }

    /* disable interleaving mode */
    bitmap_clear_all(numa_alloc_interleave_mask);

    return SYS_ERR_OK;
}


/**
 * \brief returns the mask of nodes from which memory can currently be
 *        allocated.
 *
 * \returns bitmap of nodes from which memory can be allocated
 */
struct bitmap *numa_get_membind(void)
{
    assert(numa_alloc_bind_mask);
    struct bitmap *im = numa_allocate_nodemask();
    if (im == NULL) {
        return NULL;
    }
    bitmap_copy(im, numa_alloc_bind_mask);
    return im;
}
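
/*
 * Usage sketch (illustrative only): restrict all future allocations to the
 * local node via numa_set_membind(). This assumes numa_allocate_nodemask()
 * hands out a mask of at least NUMA_MAX_NUMNODES bits, as the size check in
 * numa_set_membind() requires.
 *
 * \code
 * struct bitmap *mask = numa_allocate_nodemask();
 * if (mask != NULL) {
 *     bitmap_set_bit(mask, numa_current_node());
 *     errval_t err = numa_set_membind(mask);
 *     if (err_is_fail(err)) {
 *         DEBUG_ERR(err, "numa_set_membind");
 *     }
 * }
 * \endcode
 */
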
/**
 * \brief allocates memory on a specific node.
 *
 * \param size     size of the region in bytes
 * \param node     ID of the node to allocate from
 * \param pagesize page size to be used for the mapping
 *
 * \returns pointer to memory region
 *
 * The size argument will be rounded up to a multiple of the system page size.
 * If the specified node is externally denied to this process, this call will
 * fail. The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc_onnode(size_t size, nodeid_t node, size_t pagesize)
{
    errval_t err;

    /*
     * TODO: keep track of the allocated numa frames
     */

    NUMA_DEBUG_ALLOC("allocate on node %" PRIuNODEID "\n", node);

    /* validate page size and round up size to the next multiple of it */
    vregion_flags_t flags;
    pagesize = validate_page_size(pagesize, &flags);
    size = (size + pagesize - 1) & ~(pagesize - 1);

    /* allocate frame */
    struct capref frame;
    size_t ret_size;
    err = numa_frame_alloc_on_node(&frame, size, node, &ret_size);
    if (err_is_fail(err)) {
        return NULL;
    }

    NUMA_DEBUG_ALLOC("mapping allocated frame\n");

    void *addr;
    err = vspace_map_one_frame_attr_aligned(&addr, size, frame, flags,
                                            pagesize, NULL, NULL);
    if (err_is_fail(err)) {
        USER_PANIC_ERR(err, "vspace_map_one_frame_attr_aligned");
        err = numa_frame_free(frame);
        if (err_is_fail(err)) {
            USER_PANIC_ERR(err, "nested error while freeing frame");
        }
        return NULL;
    }

    NUMA_DEBUG_ALLOC("frame mapped @ %p\n", addr);

    return addr;
}


/**
 * \brief allocates size bytes of memory on the local node
 *
 * \param size     size of the memory region in bytes
 * \param pagesize page size to be used for the mapping
 *
 * \returns pointer to memory region
 *
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc_local(size_t size, size_t pagesize)
{
    nodeid_t node = numa_current_node();

    NUMA_DEBUG_ALLOC("allocate on local node %" PRIuNODEID "\n", node);

    return numa_alloc_onnode(size, node, pagesize);
}


/**
 * \brief allocates size bytes of memory page interleaved on all nodes.
 *
 * \param size     size of the memory region in bytes
 * \param pagesize preferred page size to be used
 *
 * \returns pointer to the mapped memory region
 *
 * Should only be used for large areas consisting of multiple pages.
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc_interleaved(size_t size, size_t pagesize)
{
    return numa_alloc_interleaved_subset(size, pagesize, numa_all_nodes_ptr);
}
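
/*
 * Usage sketch (illustrative only): allocate a buffer on node 0 with the
 * base page size and release it again. Note that numa_free() below is still
 * NYI, so this shows the intended pairing of the calls, not a tested path.
 *
 * \code
 * size_t bytes = 2 * BASE_PAGE_SIZE;
 * uint8_t *buf = numa_alloc_onnode(bytes, 0, BASE_PAGE_SIZE);
 * if (buf != NULL) {
 *     memset(buf, 0, bytes);
 *     numa_free(buf, bytes);
 * }
 * \endcode
 */
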
/**
 * \brief allocates size bytes of memory page interleaved on the nodes
 *        specified in the nodemask.
 *
 * \param size     size of the memory region in bytes
 * \param pagesize preferred page size to be used
 * \param nodemask subset of nodes to consider for allocation
 *
 * \returns pointer to the mapped memory region
 *
 * Should only be used for large areas consisting of multiple pages.
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc_interleaved_subset(size_t size, size_t pagesize,
                                    struct bitmap *nodemask)
{
    errval_t err;

    /* clear out invalid bits */
    bitmap_clear_range(nodemask, numa_num_configured_nodes(),
                       bitmap_get_nbits(nodemask));

    /* get the number of nodes */
    nodeid_t nodes = bitmap_get_weight(nodemask);
    if (nodes == 0) {
        return NULL;
    }

    NUMA_DEBUG_ALLOC("allocating interleaved using %" PRIuNODEID " nodes\n", nodes);

    assert(nodes <= numa_num_configured_nodes());

    vregion_flags_t flags;
    pagesize = validate_page_size(pagesize, &flags);
    size_t stride = pagesize;

    /* round the per-node share up to a multiple of the page size */
    size_t node_size = size / nodes;
    node_size = (node_size + pagesize - 1) & ~(pagesize - 1);

    /* update total size as this may change due to rounding of node sizes */
    size = nodes * node_size;

    /*
     * XXX: we may want to keep track of numa allocated frames
     */

    struct memobj_numa *memobj = calloc(1, sizeof(struct memobj_numa));
    if (memobj == NULL) {
        return NULL;
    }
    err = memobj_create_numa(memobj, size, 0, numa_num_configured_nodes(), stride);
    if (err_is_fail(err)) {
        free(memobj);
        return NULL;
    }

    /* fill the memory object with one frame per node set in the mask */
    bitmap_bit_t node = bitmap_get_first(nodemask);
    nodeid_t node_idx = 0;
    while (node != BITMAP_BIT_NONE) {
        struct capref frame;
        err = numa_frame_alloc_on_node(&frame, node_size, (nodeid_t)node, NULL);
        if (err_is_fail(err)) {
            DEBUG_ERR(err, "numa_frame_alloc_on_node");
            goto out_err;
        }
        memobj->m.f.fill(&memobj->m, node_idx, frame, 0);
        ++node_idx;
        node = bitmap_get_next(nodemask, node);
    }

    struct vregion *vreg = calloc(1, sizeof(struct vregion));
    if (vreg == NULL) {
        goto out_err;
    }
    err = vregion_map_aligned(vreg, get_current_vspace(), &memobj->m, 0, size,
                              flags, pagesize);
    if (err_is_fail(err)) {
        DEBUG_ERR(err, "vregion_map_aligned");
        free(vreg);
        goto out_err;
    }

    /* eagerly back the region by faulting in all pages */
    err = memobj->m.f.pagefault(&memobj->m, vreg, 0, 0);
    if (err_is_fail(err)) {
        vregion_destroy(vreg);
        free(vreg);
        DEBUG_ERR(err, "memobj.m.f.pagefault");
        goto out_err;
    }

    // XXX - Is this right?
    return (void *)(uintptr_t)vregion_get_base_addr(vreg);

    out_err:
    /* unfill and delete the frames that were already inserted */
    for (int i = 0; i < node_idx; ++i) {
        struct capref frame;
        memobj->m.f.unfill(&memobj->m, i, &frame, NULL);
        cap_delete(frame);
    }
    free(memobj);
    return NULL;
}


/**
 * \brief allocates size bytes of memory with the current NUMA policy.
 *
 * \param size     size of the memory region in bytes
 * \param pagesize preferred page size to be used
 *
 * \returns pointer to the mapped memory region
 *
 * The memory must be freed with numa_free(). On errors NULL is returned.
 */
void *numa_alloc(size_t size, size_t pagesize)
{
    NUMA_DEBUG_ALLOC("allocate according to policy\n");

    /* check if we use interleaved mode */
    if (bitmap_get_weight(numa_alloc_interleave_mask)) {
        return numa_alloc_interleaved_subset(size, pagesize,
                                             numa_alloc_interleave_mask);
    }

    /* check membind */
    if (bitmap_get_weight(numa_alloc_bind_mask) == 1) {
        nodeid_t node = (nodeid_t) bitmap_get_first(numa_alloc_bind_mask);
        return numa_alloc_onnode(size, node, pagesize);
    }

    /* TODO:
     * - handle the case where multiple nodes are set in membind
     */

    /* just return some memory */
    return malloc(size);
}
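
/*
 * Usage sketch (illustrative only): once an interleave policy is set, a
 * plain numa_alloc() stripes the region over the masked nodes with a stride
 * of one (validated) page, i.e. consecutive pages land on consecutive nodes
 * in the mask. This assumes numa_all_nodes_ptr carries the full
 * NUMA_MAX_NUMNODES bits, as its use in numa_set_membind() suggests.
 *
 * \code
 * numa_set_interleave_mask(numa_all_nodes_ptr);
 * void *region = numa_alloc(16 * BASE_PAGE_SIZE, BASE_PAGE_SIZE);
 * \endcode
 */
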
/**
 * \brief changes the size of the memory area.
 *
 * \param old_addr pointer to the old memory region
 * \param old_size size of the old memory region
 * \param new_size new size to allocate
 */
void *numa_realloc(void *old_addr, size_t old_size, size_t new_size)
{
    assert(!"NYI");
    return 0;
}


/**
 * \brief frees size bytes of memory starting at start
 *
 * \param start start of the memory region
 * \param size  number of bytes to free
 *
 * The memory must have been allocated by one of the numa_alloc* functions.
 */
void numa_free(void *start, size_t size)
{
    assert(!"NYI");
}


/**
 * \brief allocates a frame on a specific node
 *
 * \param dest     capref to store the frame
 * \param size     size of the frame to allocate
 * \param node     node on which the frame should be allocated
 * \param ret_size returned size of the frame capability
 *
 * \returns SYS_ERR_OK on SUCCESS
 *          errval on FAILURE
 */
errval_t numa_frame_alloc_on_node(struct capref *dest,
                                  size_t size,
                                  nodeid_t node,
                                  size_t *ret_size)
{
    errval_t err;

    NUMA_DEBUG_ALLOC("allocating frame on node %" PRIuNODEID "\n", node);

    uint64_t min_base, max_limit;
    ram_get_affinity(&min_base, &max_limit);

    if (node >= numa_topology.num_nodes) {
        return NUMA_ERR_NODEID_INVALID;
    }

    uint64_t node_base = numa_node_base(node);
    uint64_t node_limit = node_base + numa_node_size(node, NULL);

    NUMA_DEBUG_ALLOC("setting affinity to 0x%" PRIx64 "..0x%" PRIx64 "\n",
                     node_base, node_limit);

    /* restrict RAM allocation to the node's address range */
    ram_set_affinity(node_base, node_limit);

    err = frame_alloc(dest, size, ret_size);

    /* restore the previous affinity */
    ram_set_affinity(min_base, max_limit);

    NUMA_DEBUG_ALLOC("restore affinity to 0x%" PRIx64 "..0x%" PRIx64 "\n",
                     min_base, max_limit);

    return err;
}


/**
 * \brief frees a previously allocated frame
 *
 * \param frame capability to free
 */
errval_t numa_frame_free(struct capref frame)
{
    assert(!"NYI");
    return 0;
}


/**
 * \brief moves a list of pages in the address space of the given domain
 *
 * \param did    the domain ID
 * \param count  number of pages to move
 * \param pages  list of pages
 * \param nodes  list of nodes to which the pages can be moved
 * \param status returns the outcome for each page
 * \param flags  flags for moving the pages
 *
 * \returns SYS_ERR_OK on SUCCESS
 */
errval_t numa_move_pages(domainid_t did,
                         size_t count,
                         void **pages,
                         const nodeid_t *nodes,
                         errval_t *status,
                         int flags)
{
    assert(!"NYI");
    return 0;
}


/**
 * \brief migrates the pages of a domain from one set of nodes to another
 *
 * \param did       the domain ID
 * \param fromnodes bitmap representing the current nodes
 * \param tonodes   bitmap representing the destination nodes
 *
 * \returns SYS_ERR_OK on SUCCESS
 */
errval_t numa_migrate_pages(domainid_t did,
                            struct bitmap *fromnodes,
                            struct bitmap *tonodes)
{
    assert(!"NYI");
    return 0;
}
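
/*
 * Usage sketch (illustrative only): numa_frame_alloc_on_node() is the
 * low-level building block behind numa_alloc_onnode() above. A caller that
 * wants to manage the mapping itself can combine it with
 * vspace_map_one_frame_attr_aligned() in the same way that function does:
 *
 * \code
 * struct capref frame;
 * size_t retbytes;
 * errval_t err = numa_frame_alloc_on_node(&frame, BASE_PAGE_SIZE, 0, &retbytes);
 * if (err_is_ok(err)) {
 *     void *addr;
 *     err = vspace_map_one_frame_attr_aligned(&addr, retbytes, frame,
 *                                             VREGION_FLAGS_READ_WRITE,
 *                                             BASE_PAGE_SIZE, NULL, NULL);
 * }
 * \endcode
 */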