1/*
2 * High-level sync()-related operations
3 */
4
5#include <linux/kernel.h>
6#include <linux/file.h>
7#include <linux/fs.h>
8#include <linux/slab.h>
9#include <linux/module.h>
10#include <linux/sched.h>
11#include <linux/writeback.h>
12#include <linux/syscalls.h>
13#include <linux/linkage.h>
14#include <linux/pagemap.h>
15#include <linux/quotaops.h>
16#include <linux/buffer_head.h>
17#include <linux/backing-dev.h>
18#include "internal.h"
19
/* Every flag bit sys_sync_file_range() accepts; any other bit is -EINVAL. */
#define VALID_FLAGS						\
	(SYNC_FILE_RANGE_WAIT_BEFORE |				\
	 SYNC_FILE_RANGE_WRITE |				\
	 SYNC_FILE_RANGE_WAIT_AFTER)
22
23/*
24 * Do the filesystem syncing work. For simple filesystems
25 * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
26 * submit IO for these buffers via __sync_blockdev(). This also speeds up the
27 * wait == 1 case since in that case write_inode() functions do
28 * sync_dirty_buffer() and thus effectively write one block at a time.
29 */
30static int __sync_filesystem(struct super_block *sb, int wait)
31{
32	/*
33	 * This should be safe, as we require bdi backing to actually
34	 * write out data in the first place
35	 */
36	if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
37		return 0;
38
39	if (sb->s_qcop && sb->s_qcop->quota_sync)
40		sb->s_qcop->quota_sync(sb, -1, wait);
41
42	if (wait)
43		sync_inodes_sb(sb);
44	else
45		writeback_inodes_sb(sb);
46
47	if (sb->s_op->sync_fs)
48		sb->s_op->sync_fs(sb, wait);
49	return __sync_blockdev(sb->s_bdev, wait);
50}
51
52/*
53 * Write out and wait upon all dirty data associated with this
54 * superblock.  Filesystem data as well as the underlying block
55 * device.  Takes the superblock lock.
56 */
57int sync_filesystem(struct super_block *sb)
58{
59	int ret;
60
61	/*
62	 * We need to be protected against the filesystem going from
63	 * r/o to r/w or vice versa.
64	 */
65	WARN_ON(!rwsem_is_locked(&sb->s_umount));
66
67	/*
68	 * No point in syncing out anything if the filesystem is read-only.
69	 */
70	if (sb->s_flags & MS_RDONLY)
71		return 0;
72
73	ret = __sync_filesystem(sb, 0);
74	if (ret < 0)
75		return ret;
76	return __sync_filesystem(sb, 1);
77}
78EXPORT_SYMBOL_GPL(sync_filesystem);
79
80static void sync_one_sb(struct super_block *sb, void *arg)
81{
82	if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
83		__sync_filesystem(sb, *(int *)arg);
84}
85/*
86 * Sync all the data for all the filesystems (called by sys_sync() and
87 * emergency sync)
88 */
89static void sync_filesystems(int wait)
90{
91	iterate_supers(sync_one_sb, &wait);
92}
93
94/*
95 * sync everything.  Start out by waking pdflush, because that writes back
96 * all queues in parallel.
97 */
98SYSCALL_DEFINE0(sync)
99{
100	wakeup_flusher_threads(0);
101	sync_filesystems(0);
102	sync_filesystems(1);
103	if (unlikely(laptop_mode))
104		laptop_sync_completion();
105	return 0;
106}
107
/*
 * Work handler queued by emergency_sync().  Performs two non-waiting sync
 * sweeps over all filesystems, logs completion and frees @work, which was
 * allocated by emergency_sync().
 */
static void do_sync_work(struct work_struct *work)
{
	int pass;

#ifdef CONFIG_DUMP_PREV_OOPS_MSG
	enable_oopsbuf(1);
#endif
	/*
	 * Two sweeps reduce the possibility that some inodes / pages were
	 * skipped because they were temporarily locked.
	 */
	for (pass = 0; pass < 2; pass++)
		sync_filesystems(0);

	printk("Emergency Sync complete\n");
	kfree(work);
}
122
123void emergency_sync(void)
124{
125	struct work_struct *work;
126
127	work = kmalloc(sizeof(*work), GFP_ATOMIC);
128	if (work) {
129		INIT_WORK(work, do_sync_work);
130		schedule_work(work);
131	}
132}
133
134/**
135 * vfs_fsync_range - helper to sync a range of data & metadata to disk
136 * @file:		file to sync
137 * @start:		offset in bytes of the beginning of data range to sync
138 * @end:		offset in bytes of the end of data range (inclusive)
139 * @datasync:		perform only datasync
140 *
141 * Write back data in range @start..@end and metadata for @file to disk.  If
142 * @datasync is set only metadata needed to access modified file data is
143 * written.
144 */
145int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
146{
147	struct address_space *mapping = file->f_mapping;
148	int err, ret;
149
150	if (!file->f_op || !file->f_op->fsync) {
151		ret = -EINVAL;
152		goto out;
153	}
154
155	ret = filemap_write_and_wait_range(mapping, start, end);
156
157	/*
158	 * We need to protect against concurrent writers, which could cause
159	 * livelocks in fsync_buffers_list().
160	 */
161	mutex_lock(&mapping->host->i_mutex);
162	err = file->f_op->fsync(file, datasync);
163	if (!ret)
164		ret = err;
165	mutex_unlock(&mapping->host->i_mutex);
166
167out:
168	return ret;
169}
170EXPORT_SYMBOL(vfs_fsync_range);
171
172/**
173 * vfs_fsync - perform a fsync or fdatasync on a file
174 * @file:		file to sync
175 * @datasync:		only perform a fdatasync operation
176 *
177 * Write back data and metadata for @file to disk.  If @datasync is
178 * set only metadata needed to access modified file data is written.
179 */
180int vfs_fsync(struct file *file, int datasync)
181{
182	return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
183}
184EXPORT_SYMBOL(vfs_fsync);
185
186static int do_fsync(unsigned int fd, int datasync)
187{
188	struct file *file;
189	int ret = -EBADF;
190
191	file = fget(fd);
192	if (file) {
193		ret = vfs_fsync(file, datasync);
194		fput(file);
195	}
196	return ret;
197}
198
199SYSCALL_DEFINE1(fsync, unsigned int, fd)
200{
201	return do_fsync(fd, 0);
202}
203
204SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
205{
206	return do_fsync(fd, 1);
207}
208
209/**
210 * generic_write_sync - perform syncing after a write if file / inode is sync
211 * @file:	file to which the write happened
212 * @pos:	offset where the write started
213 * @count:	length of the write
214 *
215 * This is just a simple wrapper about our general syncing function.
216 */
217int generic_write_sync(struct file *file, loff_t pos, loff_t count)
218{
219	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
220		return 0;
221	return vfs_fsync_range(file, pos, pos + count - 1,
222			       (file->f_flags & __O_SYNC) ? 0 : 1);
223}
224EXPORT_SYMBOL(generic_write_sync);
225
226/*
227 * sys_sync_file_range() permits finely controlled syncing over a segment of
228 * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
229 * zero then sys_sync_file_range() will operate from offset out to EOF.
230 *
231 * The flag bits are:
232 *
233 * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
234 * before performing the write.
235 *
236 * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
237 * range which are not presently under writeback. Note that this may block for
238 * significant periods due to exhaustion of disk request structures.
239 *
240 * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
241 * after performing the write.
242 *
243 * Useful combinations of the flag bits are:
244 *
245 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
246 * in the range which were dirty on entry to sys_sync_file_range() are placed
247 * under writeout.  This is a start-write-for-data-integrity operation.
248 *
249 * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
250 * are not presently under writeout.  This is an asynchronous flush-to-disk
251 * operation.  Not suitable for data integrity operations.
252 *
253 * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
254 * completion of writeout of all pages in the range.  This will be used after an
255 * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
256 * for that operation to complete and to return the result.
257 *
258 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
259 * a traditional sync() operation.  This is a write-for-data-integrity operation
260 * which will ensure that all pages in the range which were dirty on entry to
261 * sys_sync_file_range() are committed to disk.
262 *
263 *
264 * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
265 * I/O errors or ENOSPC conditions and will return those to the caller, after
266 * clearing the EIO and ENOSPC flags in the address_space.
267 *
268 * It should be noted that none of these operations write out the file's
269 * metadata.  So unless the application is strictly performing overwrites of
270 * already-instantiated disk blocks, there are no guarantees here that the data
271 * will be available after a crash.
272 */
273SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
274				unsigned int flags)
275{
276	int ret;
277	struct file *file;
278	struct address_space *mapping;
279	loff_t endbyte;			/* inclusive */
280	int fput_needed;
281	umode_t i_mode;
282
283	ret = -EINVAL;
284	if (flags & ~VALID_FLAGS)
285		goto out;
286
287	endbyte = offset + nbytes;
288
289	if ((s64)offset < 0)
290		goto out;
291	if ((s64)endbyte < 0)
292		goto out;
293	if (endbyte < offset)
294		goto out;
295
296	if (sizeof(pgoff_t) == 4) {
297		if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
298			/*
299			 * The range starts outside a 32 bit machine's
300			 * pagecache addressing capabilities.  Let it "succeed"
301			 */
302			ret = 0;
303			goto out;
304		}
305		if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
306			/*
307			 * Out to EOF
308			 */
309			nbytes = 0;
310		}
311	}
312
313	if (nbytes == 0)
314		endbyte = LLONG_MAX;
315	else
316		endbyte--;		/* inclusive */
317
318	ret = -EBADF;
319	file = fget_light(fd, &fput_needed);
320	if (!file)
321		goto out;
322
323	i_mode = file->f_path.dentry->d_inode->i_mode;
324	ret = -ESPIPE;
325	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
326			!S_ISLNK(i_mode))
327		goto out_put;
328
329	mapping = file->f_mapping;
330	if (!mapping) {
331		ret = -EINVAL;
332		goto out_put;
333	}
334
335	ret = 0;
336	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
337		ret = filemap_fdatawait_range(mapping, offset, endbyte);
338		if (ret < 0)
339			goto out_put;
340	}
341
342	if (flags & SYNC_FILE_RANGE_WRITE) {
343		ret = filemap_fdatawrite_range(mapping, offset, endbyte);
344		if (ret < 0)
345			goto out_put;
346	}
347
348	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
349		ret = filemap_fdatawait_range(mapping, offset, endbyte);
350
351out_put:
352	fput_light(file, fput_needed);
353out:
354	return ret;
355}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
/*
 * Syscall-wrapper entry point: takes @fd and @flags as longs, narrows them
 * to int / unsigned int and calls the real implementation.
 */
asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes,
				    long flags)
{
	int real_fd = (int) fd;
	unsigned int real_flags = (unsigned int) flags;

	return SYSC_sync_file_range(real_fd, offset, nbytes, real_flags);
}
SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range);
#endif
365
366/* It would be nice if people remember that not all the world's an i386
367   when they introduce new system calls */
368SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags,
369				 loff_t offset, loff_t nbytes)
370{
371	return sys_sync_file_range(fd, offset, nbytes, flags);
372}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
/*
 * Syscall-wrapper entry point: takes @fd and @flags as longs, narrows them
 * to int / unsigned int and calls the real implementation.
 */
asmlinkage long SyS_sync_file_range2(long fd, long flags,
				     loff_t offset, loff_t nbytes)
{
	int real_fd = (int) fd;
	unsigned int real_flags = (unsigned int) flags;

	return SYSC_sync_file_range2(real_fd, real_flags, offset, nbytes);
}
SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2);
#endif
382