// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/page_reporting.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>

#include "page_reporting.h"
#include "internal.h"

/* Initialize to an unsupported value */
unsigned int page_reporting_order = -1;

static int page_order_update_notify(const char *val, const struct kernel_param *kp)
{
	/*
	 * If the param is set beyond this limit, the order is set to the
	 * default pageblock_order value.
	 */
	return param_set_uint_minmax(val, kp, 0, MAX_PAGE_ORDER);
}

static const struct kernel_param_ops page_reporting_param_ops = {
	.set = &page_order_update_notify,
	/*
	 * For the get op, use param_get_int instead of param_get_uint.
	 * This makes sure that the initialized value of -1 is shown
	 * correctly when the parameter is unset.
	 */
	.get = &param_get_int,
};

module_param_cb(page_reporting_order, &page_reporting_param_ops,
			&page_reporting_order, 0644);
MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");

/*
 * This symbol is also a kernel parameter. Export page_reporting_order so
 * that other drivers can access it to control order values without having
 * to introduce another configurable parameter. Only one driver can register
 * with the page_reporting driver for the service, so we have just one
 * control parameter for the use case (which can be accessed from both
 * drivers).
 */
EXPORT_SYMBOL_GPL(page_reporting_order);

#define PAGE_REPORTING_DELAY	(2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;

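/*
 * Reporting state machine: IDLE means no work is pending, REQUESTED means
 * a reporting pass has been requested, and ACTIVE means the worker is
 * currently making a pass over the zones.
 */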
enum {
	PAGE_REPORTING_IDLE = 0,
	PAGE_REPORTING_REQUESTED,
	PAGE_REPORTING_ACTIVE
};

/* request page reporting */
static void
__page_reporting_request(struct page_reporting_dev_info *prdev)
{
	unsigned int state;

	/* Check to see if we are in the desired state */
	state = atomic_read(&prdev->state);
	if (state == PAGE_REPORTING_REQUESTED)
		return;

	/*
	 * If reporting is already active there is nothing we need to do.
	 * Test against 0 as that represents PAGE_REPORTING_IDLE.
	 */
	state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
	if (state != PAGE_REPORTING_IDLE)
		return;

	/*
	 * Delay the start of work to allow a sizable queue to build. For
	 * now we are limiting this to running no more than once every
	 * couple of seconds.
	 */
	schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

/* notify prdev of free page reporting request */
void __page_reporting_notify(void)
{
	struct page_reporting_dev_info *prdev;

	/*
	 * We use RCU to protect the pr_dev_info pointer. In almost all
	 * cases this should be present; however, in the unlikely case of
	 * a shutdown it will be NULL and we should exit.
	 */
	rcu_read_lock();
	prdev = rcu_dereference(pr_dev_info);
	if (likely(prdev))
		__page_reporting_request(prdev);

	rcu_read_unlock();
}

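/*
 * Return the pages in the scatterlist to their free lists, flagging them
 * as reported when the device call succeeded. The caller is expected to
 * hold the zone lock.
 */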
static void
page_reporting_drain(struct page_reporting_dev_info *prdev,
		     struct scatterlist *sgl, unsigned int nents, bool reported)
{
	struct scatterlist *sg = sgl;

	/*
	 * Drain the now reported pages back into their respective
	 * free lists/areas. We assume at least one page is populated.
	 */
	do {
		struct page *page = sg_page(sg);
		int mt = get_pageblock_migratetype(page);
		unsigned int order = get_order(sg->length);

		__putback_isolated_page(page, order, mt);

		/* If the pages were not reported due to an error, skip flagging */
		if (!reported)
			continue;

		/*
		 * If the page was not comingled with another page we can
		 * consider the result to be "reported" since the page
		 * hasn't been modified, otherwise we will need to
		 * report on the new larger page when we make our way
		 * up to that higher order.
		 */
		if (PageBuddy(page) && buddy_order(page) == order)
			__SetPageReported(page);
	} while ((sg = sg_next(sg)));

	/* reinitialize scatterlist now that it is empty */
	sg_init_table(sgl, nents);
}

/*
 * The page reporting cycle consists of 4 stages: fill, report, drain, and
 * idle. We will cycle through the first 3 stages until we cannot obtain a
 * full scatterlist of pages, at which point we will switch to idle.
 */
static int
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
		     unsigned int order, unsigned int mt,
		     struct scatterlist *sgl, unsigned int *offset)
{
	struct free_area *area = &zone->free_area[order];
	struct list_head *list = &area->free_list[mt];
	unsigned int page_len = PAGE_SIZE << order;
	struct page *page, *next;
	long budget;
	int err = 0;

	/*
	 * Perform an early check; if the free area is empty there is
	 * nothing to process, so we can skip this free_list.
	 */
	if (list_empty(list))
		return err;

	spin_lock_irq(&zone->lock);

	/*
	 * Limit how many calls we will be making to the page reporting
	 * device for this list. By doing this we avoid processing any
	 * given list for too long.
	 *
	 * The current value used allows us enough calls to process over a
	 * sixteenth of the current list plus one additional call to handle
	 * any pages that may have already been present from the previous
	 * list processed. This should result in us reporting all pages on
	 * an idle system in about 30 seconds.
	 *
	 * The division here should be cheap since PAGE_REPORTING_CAPACITY
	 * should always be a power of 2.
	 */
	budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);

	/* loop through free list adding unreported pages to sg list */
	list_for_each_entry_safe(page, next, list, lru) {
		/* We are going to skip over the reported pages. */
		if (PageReported(page))
			continue;

		/*
		 * If we fully consumed our budget then update our
		 * state to indicate that we are requesting additional
		 * processing and exit this list.
		 */
		if (budget < 0) {
			atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
			next = page;
			break;
		}

		/* Attempt to pull page from list and place in scatterlist */
		if (*offset) {
			if (!__isolate_free_page(page, order)) {
				next = page;
				break;
			}

			/* Add page to scatter list */
			--(*offset);
			sg_set_page(&sgl[*offset], page, page_len, 0);

			continue;
		}

		/*
		 * Make the first non-reported page in the free list
		 * the new head of the free list before we release the
		 * zone lock.
		 */
		if (!list_is_first(&page->lru, list))
			list_rotate_to_front(&page->lru, list);

		/* release lock before waiting on report processing */
		spin_unlock_irq(&zone->lock);

		/* begin processing pages in local list */
		err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);

		/* reset offset since the full list was reported */
		*offset = PAGE_REPORTING_CAPACITY;

		/* update budget to reflect call to report function */
		budget--;

		/* reacquire zone lock and resume processing */
		spin_lock_irq(&zone->lock);

		/* flush reported pages from the sg list */
		page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);

		/*
		 * Reset next to the first entry; the old next isn't valid
		 * since we dropped the lock to report the pages.
		 */
		next = list_first_entry(list, struct page, lru);

		/* exit on error */
		if (err)
			break;
	}

	/* Rotate any leftover pages to the head of the freelist */
	if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
		list_rotate_to_front(&next->lru, list);

	spin_unlock_irq(&zone->lock);

	return err;
}

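/*
 * Walk the free lists of a single zone from page_reporting_order on up,
 * batching unreported pages into the scatterlist and reporting any
 * leftover partial batch before returning.
 */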
static int
page_reporting_process_zone(struct page_reporting_dev_info *prdev,
			    struct scatterlist *sgl, struct zone *zone)
{
	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
	unsigned long watermark;
	int err = 0;

	/* Generate minimum watermark to be able to guarantee progress */
	watermark = low_wmark_pages(zone) +
		    (PAGE_REPORTING_CAPACITY << page_reporting_order);

	/*
	 * Cancel the request if there is insufficient free memory in the
	 * zone to guarantee forward progress.
	 */
	if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
		return err;

	/* Process each free list starting from lowest order/mt */
	for (order = page_reporting_order; order < NR_PAGE_ORDERS; order++) {
		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
			/* We do not pull pages from the isolate free list */
			if (is_migrate_isolate(mt))
				continue;

			err = page_reporting_cycle(prdev, zone, order, mt,
						   sgl, &offset);
			if (err)
				return err;
		}
	}

	/* report the leftover pages before going idle */
	leftover = PAGE_REPORTING_CAPACITY - offset;
	if (leftover) {
		sgl = &sgl[offset];
		err = prdev->report(prdev, sgl, leftover);

		/* flush any remaining pages out from the last report */
		spin_lock_irq(&zone->lock);
		page_reporting_drain(prdev, sgl, leftover, !err);
		spin_unlock_irq(&zone->lock);
	}

	return err;
}

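/*
 * Delayed work callback: make a single reporting pass over every zone and
 * reschedule the work if another request arrived while the pass was running.
 */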
static void page_reporting_process(struct work_struct *work)
{
	struct delayed_work *d_work = to_delayed_work(work);
	struct page_reporting_dev_info *prdev =
		container_of(d_work, struct page_reporting_dev_info, work);
	int err = 0, state = PAGE_REPORTING_ACTIVE;
	struct scatterlist *sgl;
	struct zone *zone;

	/*
	 * Change the state to "Active" so that we can track whether anyone
	 * requests page reporting while we are completing our pass. If the
	 * state is not altered by the end of the pass we will switch to
	 * idle and quit scheduling reporting runs.
	 */
	atomic_set(&prdev->state, state);

	/* allocate scatterlist to store pages being reported on */
	sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
	if (!sgl)
		goto err_out;

	sg_init_table(sgl, PAGE_REPORTING_CAPACITY);

	for_each_zone(zone) {
		err = page_reporting_process_zone(prdev, sgl, zone);
		if (err)
			break;
	}

	kfree(sgl);
err_out:
	/*
	 * If the state has reverted back to requested then there may be
	 * additional pages to be processed. We will defer for 2s to allow
	 * more pages to accumulate.
	 */
	state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
	if (state == PAGE_REPORTING_REQUESTED)
		schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

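/*
 * page_reporting_mutex serializes registration and unregistration of the
 * reporting device; page_reporting_enabled gates the free page reporting
 * notification path once a device has been registered.
 */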
static DEFINE_MUTEX(page_reporting_mutex);
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);

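/*
 * Register a page reporting device. Only one device can be registered at a
 * time, so -EBUSY is returned if another device already holds the interface.
 */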
int page_reporting_register(struct page_reporting_dev_info *prdev)
{
	int err = 0;

	mutex_lock(&page_reporting_mutex);

	/* nothing to do if already in use */
	if (rcu_dereference_protected(pr_dev_info,
				lockdep_is_held(&page_reporting_mutex))) {
		err = -EBUSY;
		goto err_out;
	}

	/*
	 * If the page_reporting_order value is not set, we check if an
	 * order is provided by the driver that is performing the
	 * registration. If that is not provided either, we default to
	 * pageblock_order.
	 */
	if (page_reporting_order == -1) {
		if (prdev->order > 0 && prdev->order <= MAX_PAGE_ORDER)
			page_reporting_order = prdev->order;
		else
			page_reporting_order = pageblock_order;
	}

	/* initialize state and work structures */
	atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
	INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);

	/* Begin initial flush of zones */
	__page_reporting_request(prdev);

	/* Assign device to allow notifications */
	rcu_assign_pointer(pr_dev_info, prdev);

	/* enable page reporting notification */
	if (!static_key_enabled(&page_reporting_enabled)) {
		static_branch_enable(&page_reporting_enabled);
		pr_info("Free page reporting enabled\n");
	}
err_out:
	mutex_unlock(&page_reporting_mutex);

	return err;
}
EXPORT_SYMBOL_GPL(page_reporting_register);

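/*
 * Unregister a page reporting device. If it is the device currently in use,
 * disable further notifications and cancel any outstanding reporting work.
 */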
void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
	mutex_lock(&page_reporting_mutex);

	if (prdev == rcu_dereference_protected(pr_dev_info,
				lockdep_is_held(&page_reporting_mutex))) {
		/* Disable page reporting notification */
		RCU_INIT_POINTER(pr_dev_info, NULL);
		synchronize_rcu();

		/* Flush any existing work, and lock it out */
		cancel_delayed_work_sync(&prdev->work);
	}

	mutex_unlock(&page_reporting_mutex);
}
EXPORT_SYMBOL_GPL(page_reporting_unregister);