// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

/*
 * UDS and VDO keep track of which threads are allowed to allocate memory freely, and which threads
 * must take care not to perform a memory allocation that issues an I/O request. The
 * 'allocating_threads' thread_registry and its associated methods implement this tracking.
 */
static struct thread_registry allocating_threads;

static inline bool allocations_allowed(void)
{
	return vdo_lookup_thread(&allocating_threads) != NULL;
}

/*
 * Register the current thread as an allocating thread.
 *
 * An optional flag location can be supplied indicating whether, at any given point in time, the
 * threads associated with that flag should be allocating storage. If the flag is false, a message
 * will be logged.
 *
 * If no flag is supplied, the thread is always allowed to allocate storage without complaint.
 *
 * @new_thread: registered_thread structure to use for the current thread
 * @flag_ptr: Location of the allocation-allowed flag
 */
void vdo_register_allocating_thread(struct registered_thread *new_thread,
				    const bool *flag_ptr)
{
	if (flag_ptr == NULL) {
		static const bool allocation_always_allowed = true;

		flag_ptr = &allocation_always_allowed;
	}

	vdo_register_thread(&allocating_threads, new_thread, flag_ptr);
}

/* Unregister the current thread as an allocating thread. */
void vdo_unregister_allocating_thread(void)
{
	vdo_unregister_thread(&allocating_threads);
}
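
/*
 * Illustrative sketch only (the function and the do_thread_work() helper below are hypothetical,
 * not part of this module): a thread that is free to allocate would typically register itself for
 * its lifetime and unregister on the way out:
 *
 *	static void example_thread(void *context)
 *	{
 *		struct registered_thread thread;
 *
 *		vdo_register_allocating_thread(&thread, NULL);
 *		do_thread_work(context);
 *		vdo_unregister_allocating_thread();
 *	}
 */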

/*
 * We track how much memory has been allocated and freed. When we unload the module, we log an
 * error if we have not freed all the memory that we allocated. Nearly all memory allocation and
 * freeing is done using this module.
 *
 * We do not use kernel functions like the kvasprintf() method, which allocate memory indirectly
 * using kmalloc.
 *
 * These data structures and methods are used to track the amount of memory used.
 */

/*
 * We allocate very few large objects, and allocation/deallocation isn't done in a
 * performance-critical stage for us, so a linked list should be fine.
 */
struct vmalloc_block_info {
	void *ptr;
	size_t size;
	struct vmalloc_block_info *next;
};

static struct {
	spinlock_t lock;
	size_t kmalloc_blocks;
	size_t kmalloc_bytes;
	size_t vmalloc_blocks;
	size_t vmalloc_bytes;
	size_t peak_bytes;
	struct vmalloc_block_info *vmalloc_list;
} memory_stats __cacheline_aligned;

static void update_peak_usage(void)
{
	size_t total_bytes = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes;

	if (total_bytes > memory_stats.peak_bytes)
		memory_stats.peak_bytes = total_bytes;
}

static void add_kmalloc_block(size_t size)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	memory_stats.kmalloc_blocks++;
	memory_stats.kmalloc_bytes += size;
	update_peak_usage();
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

static void remove_kmalloc_block(size_t size)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	memory_stats.kmalloc_blocks--;
	memory_stats.kmalloc_bytes -= size;
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

static void add_vmalloc_block(struct vmalloc_block_info *block)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	block->next = memory_stats.vmalloc_list;
	memory_stats.vmalloc_list = block;
	memory_stats.vmalloc_blocks++;
	memory_stats.vmalloc_bytes += block->size;
	update_peak_usage();
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

static void remove_vmalloc_block(void *ptr)
{
	struct vmalloc_block_info *block;
	struct vmalloc_block_info **block_ptr;
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	for (block_ptr = &memory_stats.vmalloc_list;
	     (block = *block_ptr) != NULL;
	     block_ptr = &block->next) {
		if (block->ptr == ptr) {
			*block_ptr = block->next;
			memory_stats.vmalloc_blocks--;
			memory_stats.vmalloc_bytes -= block->size;
			break;
		}
	}

	spin_unlock_irqrestore(&memory_stats.lock, flags);
	if (block != NULL)
		vdo_free(block);
	else
		vdo_log_info("attempting to remove ptr %px not found in vmalloc list", ptr);
}

/*
 * Determine whether allocating a memory block should use kmalloc or __vmalloc.
 *
 * vmalloc can allocate any integral number of pages.
 *
 * kmalloc can allocate any number of bytes up to a configured limit, which defaults to 8 megabytes
 * on some systems. kmalloc is especially good when memory is being both allocated and freed, and
 * it does this efficiently in a multi-CPU environment.
 *
 * kmalloc usually rounds the size of the block up to the next power of two, so when the requested
 * block is bigger than PAGE_SIZE / 2 bytes, kmalloc will never give you less space than the
 * corresponding vmalloc allocation. Sometimes vmalloc will use less overhead than kmalloc.
 *
 * The advantages of kmalloc do not help out UDS or VDO, because we allocate all our memory up
 * front and do not free and reallocate it. Sometimes we have problems using kmalloc, because the
 * Linux memory page map can become so fragmented that kmalloc will not give us a 32KB chunk. We
 * have used vmalloc as a backup to kmalloc in the past, and a follow-up vmalloc of 32KB will work.
 * But there is no strong case to be made for using kmalloc over vmalloc for these size chunks.
 *
 * The kmalloc/vmalloc boundary is set at 4KB, and kmalloc gets the 4KB requests. There is no
 * strong reason for favoring either kmalloc or vmalloc for 4KB requests, except that tracking
 * vmalloc statistics uses a linked list implementation. Using a simple test, this choice of
 * boundary results in 132 vmalloc calls. Using vmalloc for requests of exactly 4KB results in an
 * additional 6374 vmalloc calls, which is much less efficient for tracking.
 *
 * @size: How many bytes to allocate
 */
static inline bool use_kmalloc(size_t size)
{
	return size <= PAGE_SIZE;
}
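
/*
 * Worked example of the boundary above (illustrative arithmetic only, assuming 4KB pages and the
 * usual power-of-two kmalloc size classes): a 4096-byte request goes to kmalloc and fits exactly
 * in one page. A 5000-byte request goes to __vmalloc and occupies PAGE_ALIGN(5000) = 8192 bytes,
 * the same space kmalloc would use after rounding up to the next power of two. A 40000-byte
 * request occupies 40960 bytes (10 pages) from __vmalloc, whereas kmalloc would round it up to
 * 65536 bytes.
 */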

/*
 * Allocate storage based on memory size and alignment, logging an error if the allocation fails.
 * The memory will be zeroed.
 *
 * @size: The size of an object
 * @align: The required alignment
 * @what: What is being allocated (for error logging)
 * @ptr: A pointer to hold the allocated memory
 *
 * Return: VDO_SUCCESS or an error code
 */
int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr)
{
	/*
	 * The __GFP_RETRY_MAYFAIL flag means the VM implementation will retry memory reclaim
	 * procedures that have previously failed if there is some indication that progress has
	 * been made elsewhere. It can wait for other tasks to attempt high level approaches to
	 * freeing memory such as compaction (which removes fragmentation) and page-out. There is
	 * still a definite limit to the number of retries, but it is a larger limit than with
	 * __GFP_NORETRY. Allocations with this flag may fail, but only when there is genuinely
	 * little unused memory. While these allocations do not directly trigger the OOM killer,
	 * their failure indicates that the system is likely to need to use the OOM killer soon.
	 * The caller must handle failure, but can reasonably do so by failing a higher-level
	 * request, or completing it only in a much less efficient manner.
	 */
	const gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL;
	unsigned int noio_flags;
	bool allocations_restricted = !allocations_allowed();
	unsigned long start_time;
	void *p = NULL;

	if (unlikely(ptr == NULL))
		return -EINVAL;

	if (size == 0) {
		*((void **) ptr) = NULL;
		return VDO_SUCCESS;
	}

	if (allocations_restricted)
		noio_flags = memalloc_noio_save();

	start_time = jiffies;
	if (use_kmalloc(size) && (align < PAGE_SIZE)) {
		p = kmalloc(size, gfp_flags | __GFP_NOWARN);
		if (p == NULL) {
			/*
			 * It is possible for kmalloc to fail to allocate memory because there is
			 * no page available. A short sleep may allow the page reclaimer to
			 * free a page.
			 */
			fsleep(1000);
			p = kmalloc(size, gfp_flags);
		}

		if (p != NULL)
			add_kmalloc_block(ksize(p));
	} else {
		struct vmalloc_block_info *block;

		if (vdo_allocate(1, struct vmalloc_block_info, __func__, &block) == VDO_SUCCESS) {
			/*
			 * It is possible for __vmalloc to fail to allocate memory because there
			 * are no pages available. A short sleep may allow the page reclaimer
			 * to free enough pages for a small allocation.
			 *
			 * For larger allocations, the page_alloc code is racing against the page
			 * reclaimer. If the page reclaimer can stay ahead of page_alloc, the
			 * __vmalloc will succeed. But if page_alloc overtakes the page reclaimer,
			 * the allocation fails. It is possible that more retries will succeed.
			 */
			for (;;) {
				p = __vmalloc(size, gfp_flags | __GFP_NOWARN);
				if (p != NULL)
					break;

				if (jiffies_to_msecs(jiffies - start_time) > 1000) {
					/* Try one more time, logging a failure for this call. */
					p = __vmalloc(size, gfp_flags);
					break;
				}

				fsleep(1000);
			}

			if (p == NULL) {
				vdo_free(block);
			} else {
				block->ptr = p;
				block->size = PAGE_ALIGN(size);
				add_vmalloc_block(block);
			}
		}
	}

	if (allocations_restricted)
		memalloc_noio_restore(noio_flags);

	if (unlikely(p == NULL)) {
		vdo_log_error("Could not allocate %zu bytes for %s in %u msecs",
			      size, what, jiffies_to_msecs(jiffies - start_time));
		return -ENOMEM;
	}

	*((void **) ptr) = p;
	return VDO_SUCCESS;
}
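
/*
 * Illustrative sketch only (the caller, struct example_entry, and count are hypothetical):
 * callers in this file use the vdo_allocate() macro from memory-alloc.h, which computes the size
 * from a count and a type and hands the request to the allocator above, returning VDO_SUCCESS or
 * an error code:
 *
 *	struct example_entry *entries;
 *	int result = vdo_allocate(count, struct example_entry, __func__, &entries);
 *
 *	if (result != VDO_SUCCESS)
 *		return result;
 *	...
 *	vdo_free(entries);
 */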

/*
 * Allocate storage based on memory size, failing immediately if the required memory is not
 * available. The memory will be zeroed.
 *
 * @size: The size of an object
 * @what: What is being allocated (for error logging)
 *
 * Return: pointer to the allocated memory, or NULL if the required space is not available.
 */
void *vdo_allocate_memory_nowait(size_t size, const char *what __maybe_unused)
{
	void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO);

	if (p != NULL)
		add_kmalloc_block(ksize(p));

	return p;
}
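
/*
 * Illustrative note: because this variant allocates with GFP_NOWAIT, it never sleeps and is the
 * one suited to contexts that cannot block (for example while holding a spinlock). Callers must
 * be prepared for a NULL return. The hypothetical sketch:
 *
 *	buffer = vdo_allocate_memory_nowait(sizeof(*buffer), __func__);
 *	if (buffer == NULL)
 *		return -ENOMEM;
 */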

void vdo_free(void *ptr)
{
	if (ptr != NULL) {
		if (is_vmalloc_addr(ptr)) {
			remove_vmalloc_block(ptr);
			vfree(ptr);
		} else {
			remove_kmalloc_block(ksize(ptr));
			kfree(ptr);
		}
	}
}

/*
 * Reallocate dynamically allocated memory. There are no alignment guarantees for the reallocated
 * memory. If the new memory is larger than the old memory, the new space will be zeroed.
 *
 * @ptr: The memory to reallocate
 * @old_size: The old size of the memory
 * @size: The new size to allocate
 * @what: What is being allocated (for error logging)
 * @new_ptr: A pointer to hold the reallocated pointer
 *
 * Return: VDO_SUCCESS or an error code
 */
int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *what,
			  void *new_ptr)
{
	int result;

	if (size == 0) {
		vdo_free(ptr);
		*(void **) new_ptr = NULL;
		return VDO_SUCCESS;
	}

	result = vdo_allocate(size, char, what, new_ptr);
	if (result != VDO_SUCCESS)
		return result;

	if (ptr != NULL) {
		if (old_size < size)
			size = old_size;

		memcpy(*((void **) new_ptr), ptr, size);
		vdo_free(ptr);
	}

	return VDO_SUCCESS;
}
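
/*
 * Illustrative sketch only (entries, old_count, and new_count are hypothetical): growing an array
 * while preserving its contents. As implemented above, min(old_size, size) bytes are copied and
 * any additional space is zeroed:
 *
 *	struct example_entry *new_entries;
 *	int result = vdo_reallocate_memory(entries, old_count * sizeof(*entries),
 *					   new_count * sizeof(*entries), __func__,
 *					   &new_entries);
 *
 *	if (result == VDO_SUCCESS)
 *		entries = new_entries;
 */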

int vdo_duplicate_string(const char *string, const char *what, char **new_string)
{
	int result;
	u8 *dup;

	result = vdo_allocate(strlen(string) + 1, u8, what, &dup);
	if (result != VDO_SUCCESS)
		return result;

	memcpy(dup, string, strlen(string) + 1);
	*new_string = dup;
	return VDO_SUCCESS;
}

void vdo_memory_init(void)
{
	spin_lock_init(&memory_stats.lock);
	vdo_initialize_thread_registry(&allocating_threads);
}

void vdo_memory_exit(void)
{
	VDO_ASSERT_LOG_ONLY(memory_stats.kmalloc_bytes == 0,
			    "kmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
			    memory_stats.kmalloc_bytes, memory_stats.kmalloc_blocks);
	VDO_ASSERT_LOG_ONLY(memory_stats.vmalloc_bytes == 0,
			    "vmalloc memory used (%zd bytes in %zd blocks) is returned to the kernel",
			    memory_stats.vmalloc_bytes, memory_stats.vmalloc_blocks);
	vdo_log_debug("peak usage %zd bytes", memory_stats.peak_bytes);
}

void vdo_get_memory_stats(u64 *bytes_used, u64 *peak_bytes_used)
{
	unsigned long flags;

	spin_lock_irqsave(&memory_stats.lock, flags);
	*bytes_used = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes;
	*peak_bytes_used = memory_stats.peak_bytes;
	spin_unlock_irqrestore(&memory_stats.lock, flags);
}

/*
 * Report stats on any allocated memory that we're tracking. Not all allocation types are
 * guaranteed to be tracked in bytes (e.g., bios).
 */
void vdo_report_memory_usage(void)
{
	unsigned long flags;
	u64 kmalloc_blocks;
	u64 kmalloc_bytes;
	u64 vmalloc_blocks;
	u64 vmalloc_bytes;
	u64 peak_usage;
	u64 total_bytes;

	spin_lock_irqsave(&memory_stats.lock, flags);
	kmalloc_blocks = memory_stats.kmalloc_blocks;
	kmalloc_bytes = memory_stats.kmalloc_bytes;
	vmalloc_blocks = memory_stats.vmalloc_blocks;
	vmalloc_bytes = memory_stats.vmalloc_bytes;
	peak_usage = memory_stats.peak_bytes;
	spin_unlock_irqrestore(&memory_stats.lock, flags);
	total_bytes = kmalloc_bytes + vmalloc_bytes;
	vdo_log_info("current module memory tracking (actual allocation sizes, not requested):");
	vdo_log_info("  %llu bytes in %llu kmalloc blocks",
		     (unsigned long long) kmalloc_bytes,
		     (unsigned long long) kmalloc_blocks);
	vdo_log_info("  %llu bytes in %llu vmalloc blocks",
		     (unsigned long long) vmalloc_bytes,
		     (unsigned long long) vmalloc_blocks);
	vdo_log_info("  total %llu bytes, peak usage %llu bytes",
		     (unsigned long long) total_bytes, (unsigned long long) peak_usage);
}