// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec.c - kexec system call core code.
 * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/btf.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/utsname.h>
#include <linux/numa.h>
#include <linux/suspend.h>
#include <linux/device.h>
#include <linux/freezer.h>
#include <linux/panic_notifier.h>
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/console.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
#include <linux/hugetlb.h>
#include <linux/objtool.h>
#include <linux/kmsg_dump.h>

#include <asm/page.h>
#include <asm/sections.h>

#include <crypto/hash.h>
#include "kexec_internal.h"

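/*
 * Loading, unloading and executing an image are serialized with a bare
 * atomic flag rather than a mutex; see the kexec_trylock()/kexec_unlock()
 * pair used in kernel_kexec() below (the helpers are presumably provided
 * by kexec_internal.h).
 */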
atomic_t __kexec_lock = ATOMIC_INIT(0);

/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;

bool kexec_file_dbg_print;

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

/*
 * KIMAGE_NO_DEST is an impossible destination address, used when
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)
#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
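/*
 * Worked example for PAGE_COUNT(), assuming a 4 KiB PAGE_SIZE:
 * PAGE_COUNT(1) == 1, PAGE_COUNT(4096) == 1, PAGE_COUNT(4097) == 2,
 * i.e. a byte count rounded up to whole pages.
 */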

static struct page *kimage_alloc_page(struct kimage *image,
				       gfp_t gfp_mask,
				       unsigned long dest);

int sanity_check_segment_list(struct kimage *image)
{
	int i;
	unsigned long nr_segments = image->nr_segments;
	unsigned long total_pages = 0;
	unsigned long nr_pages = totalram_pages();

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		if (mstart > mend)
			return -EADDRNOTAVAIL;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			return -EADDRNOTAVAIL;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			return -EADDRNOTAVAIL;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;

			pstart = image->segment[j].mem;
			pend   = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				return -EINVAL;
		}
	}

	/* Ensure our buffer sizes are strictly less than
	 * our memory sizes.  This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			return -EINVAL;
	}

	/*
	 * Verify that no more than half of memory will be consumed. If the
	 * request from userspace is too large, a large amount of time will be
	 * wasted allocating pages, which can cause a soft lockup.
	 */
	for (i = 0; i < nr_segments; i++) {
		if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2)
			return -EINVAL;

		total_pages += PAGE_COUNT(image->segment[i].memsz);
	}

	if (total_pages > nr_pages / 2)
		return -EINVAL;

#ifdef CONFIG_CRASH_DUMP
	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of ram.  We must ensure the addresses
	 * are in the reserved area otherwise preloading the
	 * kernel could corrupt things.
	 */

	if (image->type == KEXEC_TYPE_CRASH) {
		for (i = 0; i < nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend = mstart + image->segment[i].memsz - 1;
			/* Ensure we are within the crash kernel limits */
			if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
			    (mend > phys_to_boot_phys(crashk_res.end)))
				return -EADDRNOTAVAIL;
		}
	}
#endif

	return 0;
}

struct kimage *do_kimage_alloc_init(void)
{
	struct kimage *image;

	/* Allocate a controlling structure */
	image = kzalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		return NULL;

	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unusable_pages);

#ifdef CONFIG_CRASH_HOTPLUG
	image->hp_action = KEXEC_CRASH_HP_NONE;
	image->elfcorehdr_index = -1;
	image->elfcorehdr_updated = false;
#endif

	return image;
}

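/*
 * Return 1 if any part of [start, end] overlaps one of the image's
 * destination segments, 0 otherwise.  @end is inclusive, matching the
 * addr + size - 1 convention used by the callers below.
 */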
int kimage_is_destination_range(struct kimage *image,
					unsigned long start,
					unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz - 1;
		if ((end >= mstart) && (start <= mend))
			return 1;
	}

	return 0;
}

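/*
 * Allocate 2^order pages for kexec's own bookkeeping.  The order is
 * stashed in page_private() so kimage_free_pages() can undo the
 * allocation, and the pages are marked Reserved.  Note that zeroing is
 * done by hand after arch_kexec_post_alloc_pages(), presumably so the
 * clearing happens with whatever mapping attributes (e.g. encryption
 * state) the architecture hook has just applied.
 */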
static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	if (fatal_signal_pending(current))
		return NULL;
	pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
	if (pages) {
		unsigned int count, i;

		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);

		arch_kexec_post_alloc_pages(page_address(pages), count,
					    gfp_mask);

		if (gfp_mask & __GFP_ZERO)
			for (i = 0; i < count; i++)
				clear_highpage(pages + i);
	}

	return pages;
}

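/*
 * Inverse of kimage_alloc_pages(): recover the order from page_private(),
 * let the architecture undo any attribute changes, then release the pages.
 */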
static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;

	arch_kexec_pre_free_pages(page_address(page), count);

	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

void kimage_free_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
							unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
		if (!pages)
			break;
		pfn   = page_to_boot_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = (epfn << PAGE_SHIFT) - 1;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
			      kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

#ifdef CONFIG_CRASH_DUMP
static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = ALIGN(image->control_page, size);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		cond_resched();

		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = ALIGN(mend, size);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			image->control_page = hole_end + 1;
			break;
		}
	}

	/* Ensure that these pages are decrypted if SME is enabled. */
	if (pages)
		arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);

	return pages;
}
#endif


struct page *kimage_alloc_control_pages(struct kimage *image,
					 unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
#ifdef CONFIG_CRASH_DUMP
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
#endif
	}

	return pages;
}

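/*
 * The entry list walked by the relocation stub is a flat array of
 * kimage_entry_t values, each a page-aligned physical address tagged in
 * its low bits: IND_DESTINATION sets the current destination address,
 * IND_SOURCE names a page to copy there (the destination then advances
 * by PAGE_SIZE), IND_INDIRECTION chains to the next page of entries and
 * IND_DONE terminates the list (see for_each_kimage_entry() below).
 * kimage_add_entry() appends one entry, growing the list by another
 * indirection page whenever the current one fills up.
 */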
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				   unsigned long destination)
{
	destination &= PAGE_MASK;

	return kimage_add_entry(image, destination | IND_DESTINATION);
}


static int kimage_add_page(struct kimage *image, unsigned long page)
{
	page &= PAGE_MASK;

	return kimage_add_entry(image, page | IND_SOURCE);
}


static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unusable_pages);

}

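/* Close the entry list with an IND_DONE terminator once all segments are loaded. */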
void kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;
}

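/*
 * Walk every entry in the image's list, transparently following
 * IND_INDIRECTION links to the next page of entries and stopping at
 * IND_DONE.  @ptr tracks the slot being visited, @entry its value.
 */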
#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = boot_pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

#ifdef CONFIG_CRASH_DUMP
	if (image->vmcoreinfo_data_copy) {
		crash_update_vmcoreinfo_safecopy(NULL);
		vunmap(image->vmcoreinfo_data_copy);
	}
#endif

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		} else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);

	/*
	 * Free up any temporary buffers allocated. This can happen if an
	 * error occurred long after the buffers were allocated.
	 */
	if (image->file_mode)
		kimage_file_post_load_cleanup(image);

	kfree(image);
}

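/*
 * Scan the entry list, tracking the running destination address implied
 * by IND_DESTINATION/IND_SOURCE entries, and return a pointer to the
 * IND_SOURCE entry whose destination is @page, or NULL if no source
 * page has been assigned to that destination yet.
 */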
static kimage_entry_t *kimage_dst_used(struct kimage *image,
					unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
					gfp_t gfp_mask,
					unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.  If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_boot_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (page_to_boot_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unusable_pages);
			continue;
		}
		addr = page_to_boot_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want, use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						  addr + PAGE_SIZE - 1))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it if its
			 * gfp_flags honor the ones passed in.
			 */
			if (!(gfp_mask & __GFP_HIGHMEM) &&
			    PageHighMem(old_page)) {
				kimage_free_pages(old_page);
				continue;
			}
			page = old_page;
			break;
		}
		/* Place the page on the destination list, to be used later */
		list_add(&page->lru, &image->dest_pages);
	}

	return page;
}

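/*
 * Load one segment of a normal (non-crash) image: allocate a source page
 * aimed at each destination address, record it in the entry list, and
 * fill it from the user buffer (or the kernel buffer for file based
 * kexec), leaving the tail zeroed when memsz exceeds bufsz.
 */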
static int kimage_load_normal_segment(struct kimage *image,
					 struct kexec_segment *segment)
{
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_boot_pfn(page)
								<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap_local_page(page);
		/* Start with a clear page */
		clear_page(ptr);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);

		if (uchunk) {
			/* For file based kexec, source pages are in kernel memory */
			if (image->file_mode)
				memcpy(ptr, kbuf, uchunk);
			else
				result = copy_from_user(ptr, buf, uchunk);
			ubytes -= uchunk;
			if (image->file_mode)
				kbuf += uchunk;
			else
				buf += uchunk;
		}
		kunmap_local(ptr);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		maddr  += mchunk;
		mbytes -= mchunk;

		cond_resched();
	}
out:
	return result;
}

#ifdef CONFIG_CRASH_DUMP
static int kimage_load_crash_segment(struct kimage *image,
					struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	result = 0;
	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = boot_pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		arch_kexec_post_alloc_pages(page_address(page), 1, 0);
		ptr = kmap_local_page(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);
		if (mchunk > uchunk) {
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}

		if (uchunk) {
			/* For file based kexec, source pages are in kernel memory */
			if (image->file_mode)
				memcpy(ptr, kbuf, uchunk);
			else
				result = copy_from_user(ptr, buf, uchunk);
			ubytes -= uchunk;
			if (image->file_mode)
				kbuf += uchunk;
			else
				buf += uchunk;
		}
		kexec_flush_icache_page(page);
		kunmap_local(ptr);
		arch_kexec_pre_free_pages(page_address(page), 1);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		maddr  += mchunk;
		mbytes -= mchunk;

		cond_resched();
	}
out:
	return result;
}
#endif

int kimage_load_segment(struct kimage *image,
				struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
#ifdef CONFIG_CRASH_DUMP
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
#endif
	}

	return result;
}

struct kexec_load_limit {
	/* Mutex protects the limit count. */
	struct mutex mutex;
	int limit;
};

static struct kexec_load_limit load_limit_reboot = {
	.mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex),
	.limit = -1,
};

static struct kexec_load_limit load_limit_panic = {
	.mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex),
	.limit = -1,
};

struct kimage *kexec_image;
struct kimage *kexec_crash_image;
static int kexec_load_disabled;

#ifdef CONFIG_SYSCTL
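/*
 * Handler shared by the kexec_load_limit_panic and kexec_load_limit_reboot
 * sysctls.  A limit of -1 means "unlimited"; once a finite limit is set,
 * writes may only lower it, never raise it or restore -1.  Reads report
 * the current value.
 */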
static int kexec_limit_handler(struct ctl_table *table, int write,
			       void *buffer, size_t *lenp, loff_t *ppos)
{
	struct kexec_load_limit *limit = table->data;
	int val;
	struct ctl_table tmp = {
		.data = &val,
		.maxlen = sizeof(val),
		.mode = table->mode,
	};
	int ret;

	if (write) {
		ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
		if (ret)
			return ret;

		if (val < 0)
			return -EINVAL;

		mutex_lock(&limit->mutex);
		if (limit->limit != -1 && val >= limit->limit)
			ret = -EINVAL;
		else
			limit->limit = val;
		mutex_unlock(&limit->mutex);

		return ret;
	}

	mutex_lock(&limit->mutex);
	val = limit->limit;
	mutex_unlock(&limit->mutex);

	return proc_dointvec(&tmp, write, buffer, lenp, ppos);
}

static struct ctl_table kexec_core_sysctls[] = {
	{
		.procname	= "kexec_load_disabled",
		.data		= &kexec_load_disabled,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		/* only handle a transition from default "0" to "1" */
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "kexec_load_limit_panic",
		.data		= &load_limit_panic,
		.mode		= 0644,
		.proc_handler	= kexec_limit_handler,
	},
	{
		.procname	= "kexec_load_limit_reboot",
		.data		= &load_limit_reboot,
		.mode		= 0644,
		.proc_handler	= kexec_limit_handler,
	},
	{ }
};
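/*
 * These knobs are registered under "kernel", so with procfs mounted in
 * the usual place they appear as, for example:
 *
 *   echo 1  > /proc/sys/kernel/kexec_load_disabled    # one-way disable
 *   echo 10 > /proc/sys/kernel/kexec_load_limit_reboot
 *   cat /proc/sys/kernel/kexec_load_limit_panic       # -1 == unlimited
 */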

static int __init kexec_core_sysctl_init(void)
{
	register_sysctl_init("kernel", kexec_core_sysctls);
	return 0;
}
late_initcall(kexec_core_sysctl_init);
#endif

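/*
 * Gate every load request: the caller must have CAP_SYS_BOOT, the
 * kexec_load_disabled switch must not have been flipped, and the
 * per-type load limit (panic vs. reboot image) must not be exhausted.
 * Each permitted load consumes one unit of a finite limit.
 */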
bool kexec_load_permitted(int kexec_image_type)
{
	struct kexec_load_limit *limit;

	/*
	 * The kexec syscall may only be used by the superuser, and only
	 * if it has not been disabled.
	 */
	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
		return false;

	/* Check the limit counter and decrease it. */
	limit = (kexec_image_type == KEXEC_TYPE_CRASH) ?
		&load_limit_panic : &load_limit_reboot;
	mutex_lock(&limit->mutex);
	if (!limit->limit) {
		mutex_unlock(&limit->mutex);
		return false;
	}
	if (limit->limit != -1)
		limit->limit--;
	mutex_unlock(&limit->mutex);

	return true;
}

/*
 * Move into place and start executing a preloaded standalone
 * executable.  If nothing was preloaded return an error.
 */
int kernel_kexec(void)
{
	int error = 0;

	if (!kexec_trylock())
		return -EBUSY;
	if (!kexec_image) {
		error = -EINVAL;
		goto Unlock;
	}

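	/*
	 * Two paths from here: with CONFIG_KEXEC_JUMP and a preserve_context
	 * image, devices and CPUs are quiesced as for suspend so the current
	 * kernel can be resumed afterwards; otherwise this is a one-way
	 * shutdown followed by a reboot into the new kernel.
	 */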
#ifdef CONFIG_KEXEC_JUMP
	if (kexec_image->preserve_context) {
		pm_prepare_console();
		error = freeze_processes();
		if (error) {
			error = -EBUSY;
			goto Restore_console;
		}
		suspend_console();
		error = dpm_suspend_start(PMSG_FREEZE);
		if (error)
			goto Resume_console;
		/* At this point, dpm_suspend_start() has been called,
		 * but *not* dpm_suspend_end(). We *must* call
		 * dpm_suspend_end() now.  Otherwise, drivers for
		 * some devices (e.g. interrupt controllers) become
		 * desynchronized with the actual state of the
		 * hardware at resume time, and evil weirdness ensues.
		 */
		error = dpm_suspend_end(PMSG_FREEZE);
		if (error)
			goto Resume_devices;
		error = suspend_disable_secondary_cpus();
		if (error)
			goto Enable_cpus;
		local_irq_disable();
		error = syscore_suspend();
		if (error)
			goto Enable_irqs;
	} else
#endif
	{
		kexec_in_progress = true;
		kernel_restart_prepare("kexec reboot");
		migrate_to_reboot_cpu();
		syscore_shutdown();

		/*
		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
		 * no further code needs to use CPU hotplug (which is true in
		 * the reboot case). However, the kexec path depends on using
		 * CPU hotplug again; so re-enable it here.
		 */
		cpu_hotplug_enable();
		pr_notice("Starting new kernel\n");
		machine_shutdown();
	}

	kmsg_dump(KMSG_DUMP_SHUTDOWN);
	machine_kexec(kexec_image);

#ifdef CONFIG_KEXEC_JUMP
	if (kexec_image->preserve_context) {
		syscore_resume();
 Enable_irqs:
		local_irq_enable();
 Enable_cpus:
		suspend_enable_secondary_cpus();
		dpm_resume_start(PMSG_RESTORE);
 Resume_devices:
		dpm_resume_end(PMSG_RESTORE);
 Resume_console:
		resume_console();
		thaw_processes();
 Restore_console:
		pm_restore_console();
	}
#endif

 Unlock:
	kexec_unlock();
	return error;
}
