/*
 * High-level sync()-related operations
 */
4
5#include <linux/kernel.h>
6#include <linux/file.h>
7#include <linux/fs.h>
8#include <linux/slab.h>
9#include <linux/module.h>
10#include <linux/sched.h>
11#include <linux/writeback.h>
12#include <linux/syscalls.h>
13#include <linux/linkage.h>
14#include <linux/pagemap.h>
15#include <linux/quotaops.h>
16#include <linux/buffer_head.h>
17#include <linux/backing-dev.h>
18#include "internal.h"
19
/* Mask of every flag bit that sync_file_range() accepts. */
#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE | \
		     SYNC_FILE_RANGE_WRITE | \
		     SYNC_FILE_RANGE_WAIT_AFTER)
22
23/*
24 * Do the filesystem syncing work. For simple filesystems
25 * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
26 * submit IO for these buffers via __sync_blockdev(). This also speeds up the
27 * wait == 1 case since in that case write_inode() functions do
28 * sync_dirty_buffer() and thus effectively write one block at a time.
29 */
30static int __sync_filesystem(struct super_block *sb, int wait)
31{
32	/*
33	 * This should be safe, as we require bdi backing to actually
34	 * write out data in the first place
35	 */
36	/* if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info) */
37	if (sb->s_bdi == &noop_backing_dev_info) /* foxconn modified */
38		return 0;
39
40	if (sb->s_qcop && sb->s_qcop->quota_sync)
41		sb->s_qcop->quota_sync(sb, -1, wait);
42
43	if (wait)
44		sync_inodes_sb(sb);
45	else
46		writeback_inodes_sb(sb);
47
48	if (sb->s_op->sync_fs)
49		sb->s_op->sync_fs(sb, wait);
50	return __sync_blockdev(sb->s_bdev, wait);
51}
52
53/*
54 * Write out and wait upon all dirty data associated with this
55 * superblock.  Filesystem data as well as the underlying block
56 * device.  Takes the superblock lock.
57 */
58int sync_filesystem(struct super_block *sb)
59{
60	int ret;
61
62	/*
63	 * We need to be protected against the filesystem going from
64	 * r/o to r/w or vice versa.
65	 */
66	WARN_ON(!rwsem_is_locked(&sb->s_umount));
67
68	/*
69	 * No point in syncing out anything if the filesystem is read-only.
70	 */
71	if (sb->s_flags & MS_RDONLY)
72		return 0;
73
74	ret = __sync_filesystem(sb, 0);
75	if (ret < 0)
76		return ret;
77	return __sync_filesystem(sb, 1);
78}
79EXPORT_SYMBOL_GPL(sync_filesystem);
80
81static void sync_one_sb(struct super_block *sb, void *arg)
82{
83	/* if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi) */
84	if (!(sb->s_flags & MS_RDONLY)) /* foxconn modified */
85		__sync_filesystem(sb, *(int *)arg);
86}
87/*
88 * Sync all the data for all the filesystems (called by sys_sync() and
89 * emergency sync)
90 */
91static void sync_filesystems(int wait)
92{
93	iterate_supers(sync_one_sb, &wait);
94}
95
96/*
97 * sync everything.  Start out by waking pdflush, because that writes back
98 * all queues in parallel.
99 */
100SYSCALL_DEFINE0(sync)
101{
102	wakeup_flusher_threads(0);
103	sync_filesystems(0);
104	sync_filesystems(1);
105	if (unlikely(laptop_mode))
106		laptop_sync_completion();
107	return 0;
108}
109
/*
 * Work handler queued by emergency_sync(): run the non-waiting sync over
 * all filesystems, log completion and free the work item.
 */
static void do_sync_work(struct work_struct *work)
{
	int pass;

	/*
	 * Sync twice to reduce the possibility we skipped some inodes / pages
	 * because they were temporarily locked
	 */
	for (pass = 0; pass < 2; pass++)
		sync_filesystems(0);

	printk("Emergency Sync complete\n");

	/* The work item was kmalloc()ed by emergency_sync(). */
	kfree(work);
}
121
122void emergency_sync(void)
123{
124	struct work_struct *work;
125
126	work = kmalloc(sizeof(*work), GFP_ATOMIC);
127	if (work) {
128		INIT_WORK(work, do_sync_work);
129		schedule_work(work);
130	}
131}
132
133/**
134 * vfs_fsync_range - helper to sync a range of data & metadata to disk
135 * @file:		file to sync
136 * @start:		offset in bytes of the beginning of data range to sync
137 * @end:		offset in bytes of the end of data range (inclusive)
138 * @datasync:		perform only datasync
139 *
140 * Write back data in range @start..@end and metadata for @file to disk.  If
141 * @datasync is set only metadata needed to access modified file data is
142 * written.
143 */
144int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
145{
146	struct address_space *mapping = file->f_mapping;
147	int err, ret;
148
149	if (!file->f_op || !file->f_op->fsync) {
150		ret = -EINVAL;
151		goto out;
152	}
153
154	ret = filemap_write_and_wait_range(mapping, start, end);
155
156	/*
157	 * We need to protect against concurrent writers, which could cause
158	 * livelocks in fsync_buffers_list().
159	 */
160	mutex_lock(&mapping->host->i_mutex);
161	err = file->f_op->fsync(file, datasync);
162	if (!ret)
163		ret = err;
164	mutex_unlock(&mapping->host->i_mutex);
165
166out:
167	return ret;
168}
169EXPORT_SYMBOL(vfs_fsync_range);
170
171/**
172 * vfs_fsync - perform a fsync or fdatasync on a file
173 * @file:		file to sync
174 * @datasync:		only perform a fdatasync operation
175 *
176 * Write back data and metadata for @file to disk.  If @datasync is
177 * set only metadata needed to access modified file data is written.
178 */
179int vfs_fsync(struct file *file, int datasync)
180{
181	return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
182}
183EXPORT_SYMBOL(vfs_fsync);
184
185static int do_fsync(unsigned int fd, int datasync)
186{
187	struct file *file;
188	int ret = -EBADF;
189
190	file = fget(fd);
191	if (file) {
192		ret = vfs_fsync(file, datasync);
193		fput(file);
194	}
195	return ret;
196}
197
198SYSCALL_DEFINE1(fsync, unsigned int, fd)
199{
200	return do_fsync(fd, 0);
201}
202
203SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
204{
205	return do_fsync(fd, 1);
206}
207
208/**
209 * generic_write_sync - perform syncing after a write if file / inode is sync
210 * @file:	file to which the write happened
211 * @pos:	offset where the write started
212 * @count:	length of the write
213 *
214 * This is just a simple wrapper about our general syncing function.
215 */
216int generic_write_sync(struct file *file, loff_t pos, loff_t count)
217{
218	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
219		return 0;
220	return vfs_fsync_range(file, pos, pos + count - 1,
221			       (file->f_flags & __O_SYNC) ? 0 : 1);
222}
223EXPORT_SYMBOL(generic_write_sync);
224
225/*
226 * sys_sync_file_range() permits finely controlled syncing over a segment of
227 * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
228 * zero then sys_sync_file_range() will operate from offset out to EOF.
229 *
230 * The flag bits are:
231 *
232 * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
233 * before performing the write.
234 *
235 * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
236 * range which are not presently under writeback. Note that this may block for
237 * significant periods due to exhaustion of disk request structures.
238 *
239 * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
240 * after performing the write.
241 *
242 * Useful combinations of the flag bits are:
243 *
244 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
245 * in the range which were dirty on entry to sys_sync_file_range() are placed
246 * under writeout.  This is a start-write-for-data-integrity operation.
247 *
248 * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
249 * are not presently under writeout.  This is an asynchronous flush-to-disk
250 * operation.  Not suitable for data integrity operations.
251 *
252 * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
253 * completion of writeout of all pages in the range.  This will be used after an
254 * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
255 * for that operation to complete and to return the result.
256 *
257 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
258 * a traditional sync() operation.  This is a write-for-data-integrity operation
259 * which will ensure that all pages in the range which were dirty on entry to
260 * sys_sync_file_range() are committed to disk.
261 *
262 *
263 * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
264 * I/O errors or ENOSPC conditions and will return those to the caller, after
265 * clearing the EIO and ENOSPC flags in the address_space.
266 *
267 * It should be noted that none of these operations write out the file's
268 * metadata.  So unless the application is strictly performing overwrites of
269 * already-instantiated disk blocks, there are no guarantees here that the data
270 * will be available after a crash.
271 */
272SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
273				unsigned int flags)
274{
275	int ret;
276	struct file *file;
277	struct address_space *mapping;
278	loff_t endbyte;			/* inclusive */
279	int fput_needed;
280	umode_t i_mode;
281
282	ret = -EINVAL;
283	if (flags & ~VALID_FLAGS)
284		goto out;
285
286	endbyte = offset + nbytes;
287
288	if ((s64)offset < 0)
289		goto out;
290	if ((s64)endbyte < 0)
291		goto out;
292	if (endbyte < offset)
293		goto out;
294
295	if (sizeof(pgoff_t) == 4) {
296		if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
297			/*
298			 * The range starts outside a 32 bit machine's
299			 * pagecache addressing capabilities.  Let it "succeed"
300			 */
301			ret = 0;
302			goto out;
303		}
304		if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
305			/*
306			 * Out to EOF
307			 */
308			nbytes = 0;
309		}
310	}
311
312	if (nbytes == 0)
313		endbyte = LLONG_MAX;
314	else
315		endbyte--;		/* inclusive */
316
317	ret = -EBADF;
318	file = fget_light(fd, &fput_needed);
319	if (!file)
320		goto out;
321
322	i_mode = file->f_path.dentry->d_inode->i_mode;
323	ret = -ESPIPE;
324	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
325			!S_ISLNK(i_mode))
326		goto out_put;
327
328	mapping = file->f_mapping;
329	if (!mapping) {
330		ret = -EINVAL;
331		goto out_put;
332	}
333
334	ret = 0;
335	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
336		ret = filemap_fdatawait_range(mapping, offset, endbyte);
337		if (ret < 0)
338			goto out_put;
339	}
340
341	if (flags & SYNC_FILE_RANGE_WRITE) {
342		ret = filemap_fdatawrite_range(mapping, offset, endbyte);
343		if (ret < 0)
344			goto out_put;
345	}
346
347	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
348		ret = filemap_fdatawait_range(mapping, offset, endbyte);
349
350out_put:
351	fput_light(file, fput_needed);
352out:
353	return ret;
354}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
/*
 * Entry point taking @fd and @flags as long: narrow them to the types
 * SYSC_sync_file_range() declares and forward the call.
 */
asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes,
				    long flags)
{
	int real_fd = (int) fd;
	unsigned int real_flags = (unsigned int) flags;

	return SYSC_sync_file_range(real_fd, offset, nbytes, real_flags);
}
SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range);
#endif
364
365/* It would be nice if people remember that not all the world's an i386
366   when they introduce new system calls */
367SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags,
368				 loff_t offset, loff_t nbytes)
369{
370	return sys_sync_file_range(fd, offset, nbytes, flags);
371}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
/*
 * Entry point taking @fd and @flags as long: narrow them to the types
 * SYSC_sync_file_range2() declares and forward the call.
 */
asmlinkage long SyS_sync_file_range2(long fd, long flags,
				     loff_t offset, loff_t nbytes)
{
	int real_fd = (int) fd;
	unsigned int real_flags = (unsigned int) flags;

	return SYSC_sync_file_range2(real_fd, real_flags, offset, nbytes);
}
SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2);
#endif
381