/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * Based on jffs2 zlib code:
 * Copyright © 2001-2007 Red Hat, Inc.
 * Created by David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/zlib.h>
#include <linux/zutil.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include "compression.h"

/*
 * Plan: call deflate() with avail_in == *sourcelen,
 * avail_out = *dstlen - 12 and flush == Z_FINISH.
 * If it doesn't manage to finish, call it again with
 * avail_in == 0 and avail_out set to the remaining 12
 * bytes for it to clean up.
 * Q: Is 12 bytes sufficient?
 */
#define STREAM_END_SPACE 12
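
/*
 * Note: the plan above and STREAM_END_SPACE come from the jffs2 code this
 * file is based on; nothing below actually references STREAM_END_SPACE.
 * Twelve bytes comfortably cover the 4-byte adler32 trailer plus the few
 * bytes deflate may still need to flush from its bit buffer.
 */

/*
 * A workspace holds everything one compression or decompression call
 * needs: the zlib inflate and deflate streams, each with its vmalloc'ed
 * scratch area, and buf, a one-page bounce buffer that decompression
 * inflates into before copying the bytes out to the destination pages.
 */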
struct workspace {
	z_stream inf_strm;
	z_stream def_strm;
	char *buf;
	struct list_head list;
};
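
/*
 * Workspaces are pooled rather than allocated per call: alloc_workspace
 * counts every workspace in existence, num_workspace counts the idle ones
 * parked on idle_workspace, and both are held near num_online_cpus().
 * Callers that find the pool empty and the cap reached sleep on
 * workspace_wait.
 */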
static LIST_HEAD(idle_workspace);
static DEFINE_SPINLOCK(workspace_lock);
static unsigned long num_workspace;
static atomic_t alloc_workspace = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);

/*
 * This finds an available zlib workspace or allocates a new one.
 * An ERR_PTR is returned if the allocation fails.
 */
static struct workspace *find_zlib_workspace(void)
{
	struct workspace *workspace;
	int ret;
	int cpus = num_online_cpus();

again:
	spin_lock(&workspace_lock);
	if (!list_empty(&idle_workspace)) {
		workspace = list_entry(idle_workspace.next, struct workspace,
				       list);
		list_del(&workspace->list);
		num_workspace--;
		spin_unlock(&workspace_lock);
		return workspace;
	}
	spin_unlock(&workspace_lock);
	if (atomic_read(&alloc_workspace) > cpus) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&alloc_workspace) > cpus)
			schedule();
		finish_wait(&workspace_wait, &wait);
		goto again;
	}
	atomic_inc(&alloc_workspace);
	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
	if (!workspace) {
		ret = -ENOMEM;
		goto fail;
	}

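	/*
	 * The zlib scratch areas are large (zlib_deflate_workspacesize() is
	 * typically a few hundred KB), so vmalloc is the safer choice for
	 * them; buf is a single page and comes from kmalloc below.
	 */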
	workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
	if (!workspace->def_strm.workspace) {
		ret = -ENOMEM;
		goto fail;
	}
	workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
	if (!workspace->inf_strm.workspace) {
		ret = -ENOMEM;
		goto fail_inflate;
	}
	workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
	if (!workspace->buf) {
		ret = -ENOMEM;
		goto fail_kmalloc;
	}
	return workspace;

fail_kmalloc:
	vfree(workspace->inf_strm.workspace);
fail_inflate:
	vfree(workspace->def_strm.workspace);
fail:
	kfree(workspace);
	atomic_dec(&alloc_workspace);
	wake_up(&workspace_wait);
	return ERR_PTR(ret);
}

/*
 * Put a workspace struct back on the list, or free it if we have enough
 * idle ones sitting around.
 */
static int free_workspace(struct workspace *workspace)
{
	spin_lock(&workspace_lock);
	if (num_workspace < num_online_cpus()) {
		list_add_tail(&workspace->list, &idle_workspace);
		num_workspace++;
		spin_unlock(&workspace_lock);
		if (waitqueue_active(&workspace_wait))
			wake_up(&workspace_wait);
		return 0;
	}
	spin_unlock(&workspace_lock);
	vfree(workspace->def_strm.workspace);
	vfree(workspace->inf_strm.workspace);
	kfree(workspace->buf);
	kfree(workspace);

	atomic_dec(&alloc_workspace);
	if (waitqueue_active(&workspace_wait))
		wake_up(&workspace_wait);
	return 0;
}

/*
 * Cleanup function for module exit; frees everything on the idle list.
 */
static void free_workspaces(void)
{
	struct workspace *workspace;

	while (!list_empty(&idle_workspace)) {
		workspace = list_entry(idle_workspace.next, struct workspace,
				       list);
		list_del(&workspace->list);
		vfree(workspace->def_strm.workspace);
		vfree(workspace->inf_strm.workspace);
		kfree(workspace->buf);
		kfree(workspace);
		atomic_dec(&alloc_workspace);
	}
}

/*
 * Given an address space and start/len, compress the bytes.
 *
 * Pages are allocated to hold the compressed result and stored
 * in 'pages'.
 *
 * out_pages is used to return the number of pages allocated.  There
 * may be pages allocated even if we return an error.
 *
 * total_in is used to return the number of bytes actually read.  It
 * may be smaller than len if we had to exit early because we ran
 * out of room in the pages array or because we crossed the max_out
 * threshold.
 *
 * total_out is used to return the total number of compressed bytes.
 *
 * max_out tells us the max number of bytes that we're allowed to
 * stuff into pages.
 */
int btrfs_zlib_compress_pages(struct address_space *mapping,
			      u64 start, unsigned long len,
			      struct page **pages,
			      unsigned long nr_dest_pages,
			      unsigned long *out_pages,
			      unsigned long *total_in,
			      unsigned long *total_out,
			      unsigned long max_out)
{
	int ret;
	struct workspace *workspace;
	char *data_in;
	char *cpage_out;
	int nr_pages = 0;
	struct page *in_page = NULL;
	struct page *out_page = NULL;
	unsigned long bytes_left;

	*out_pages = 0;
	*total_out = 0;
	*total_in = 0;

	workspace = find_zlib_workspace();
	if (IS_ERR(workspace))
		return -1;

	/* compression level 3 trades some ratio for speed */
	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
		printk(KERN_WARNING "deflateInit failed\n");
		ret = -1;
		goto out;
	}

	workspace->def_strm.total_in = 0;
	workspace->def_strm.total_out = 0;

	in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
	data_in = kmap(in_page);

	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!out_page) {
		ret = -1;
		goto out;
	}
	cpage_out = kmap(out_page);
	pages[0] = out_page;
	nr_pages = 1;

	workspace->def_strm.next_in = data_in;
	workspace->def_strm.next_out = cpage_out;
	workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
	workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
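
	/*
	 * zlib streaming model: deflate() consumes from next_in/avail_in and
	 * writes to next_out/avail_out, while total_in/total_out accumulate
	 * across calls; the loop below uses the totals to track progress.
	 */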

	while (workspace->def_strm.total_in < len) {
		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
		if (ret != Z_OK) {
			printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
			       ret);
			zlib_deflateEnd(&workspace->def_strm);
			ret = -1;
			goto out;
		}

		/* we're making it bigger, give up */
		if (workspace->def_strm.total_in > 8192 &&
		    workspace->def_strm.total_in <
		    workspace->def_strm.total_out) {
			ret = -1;
			goto out;
		}
		/* we need another page for writing out.  Test this
		 * before the total_in so we will pull in a new page for
		 * the stream end if required
		 */
		if (workspace->def_strm.avail_out == 0) {
			kunmap(out_page);
			if (nr_pages == nr_dest_pages) {
				out_page = NULL;
				ret = -1;
				goto out;
			}
			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
			if (!out_page) {
				ret = -1;
				goto out;
			}
			cpage_out = kmap(out_page);
			pages[nr_pages] = out_page;
			nr_pages++;
			workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
			workspace->def_strm.next_out = cpage_out;
		}
		/* we're all done */
		if (workspace->def_strm.total_in >= len)
			break;

		/* we've read in a full page, get a new one */
		if (workspace->def_strm.avail_in == 0) {
			if (workspace->def_strm.total_out > max_out)
				break;

			bytes_left = len - workspace->def_strm.total_in;
			kunmap(in_page);
			page_cache_release(in_page);

			start += PAGE_CACHE_SIZE;
			in_page = find_get_page(mapping,
						start >> PAGE_CACHE_SHIFT);
			data_in = kmap(in_page);
			workspace->def_strm.avail_in = min(bytes_left,
							   PAGE_CACHE_SIZE);
			workspace->def_strm.next_in = data_in;
		}
	}
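
	/*
	 * All input has been consumed by the loop above.  With avail_in == 0
	 * and Z_FINISH, deflate() drains anything it still has buffered and
	 * writes the stream trailer into the remaining output space.
	 */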
	workspace->def_strm.avail_in = 0;
	ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
	zlib_deflateEnd(&workspace->def_strm);

	if (ret != Z_STREAM_END) {
		ret = -1;
		goto out;
	}

	if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
		ret = -1;
		goto out;
	}

	ret = 0;
	*total_out = workspace->def_strm.total_out;
	*total_in = workspace->def_strm.total_in;
out:
	*out_pages = nr_pages;
	if (out_page)
		kunmap(out_page);

	if (in_page) {
		kunmap(in_page);
		page_cache_release(in_page);
	}
	free_workspace(workspace);
	return ret;
}
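
/*
 * Note: a -1 return above covers both hard failures and the benign case
 * where the data did not shrink (total_out >= total_in); presumably the
 * caller falls back to storing the extent uncompressed in either case.
 */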

/*
 * pages_in is an array of pages with compressed data.
 *
 * disk_start is the starting logical offset of this array in the file.
 *
 * bvec is a bio_vec of pages from the file that we want to decompress into.
 *
 * vcnt is the count of pages in the biovec.
 *
 * srclen is the number of bytes in pages_in.
 *
 * The basic idea is that we have a bio that was created by readpages.
 * The pages in the bio are for the uncompressed data, and they may not
 * be contiguous.  They all correspond to the range of bytes covered by
 * the compressed extent.
 */
int btrfs_zlib_decompress_biovec(struct page **pages_in,
				 u64 disk_start,
				 struct bio_vec *bvec,
				 int vcnt,
				 size_t srclen)
{
	int ret = 0;
	int wbits = MAX_WBITS;
	struct workspace *workspace;
	char *data_in;
	size_t total_out = 0;
	unsigned long page_bytes_left;
	unsigned long page_in_index = 0;
	unsigned long page_out_index = 0;
	struct page *page_out;
	unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
					PAGE_CACHE_SIZE;
	unsigned long buf_start;
	unsigned long buf_offset;
	unsigned long bytes;
	unsigned long working_bytes;
	unsigned long pg_offset;
	unsigned long start_byte;
	unsigned long current_buf_start;
	char *kaddr;

	workspace = find_zlib_workspace();
	if (IS_ERR(workspace))
		return -ENOMEM;

	data_in = kmap(pages_in[page_in_index]);
	workspace->inf_strm.next_in = data_in;
	workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
	workspace->inf_strm.total_in = 0;

	workspace->inf_strm.total_out = 0;
	workspace->inf_strm.next_out = workspace->buf;
	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
	page_out = bvec[page_out_index].bv_page;
	page_bytes_left = PAGE_CACHE_SIZE;
	pg_offset = 0;

	/* If it's deflate, and it's got no preset dictionary, then
	   we can tell zlib to skip the adler32 check. */
	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
	    !(((data_in[0]<<8) + data_in[1]) % 31)) {

		wbits = -((data_in[0] >> 4) + 8);
		workspace->inf_strm.next_in += 2;
		workspace->inf_strm.avail_in -= 2;
	}
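
	/*
	 * What the check above matches is the two-byte zlib header:
	 * data_in[0] (CMF) has the compression method in its low nibble and
	 * log2(window size) - 8 in its high nibble, data_in[1] (FLG) carries
	 * the preset-dictionary bit, and (CMF << 8 | FLG) is a multiple of
	 * 31 in any valid stream.  A negative wbits value tells
	 * zlib_inflateInit2() to expect a raw deflate stream, which is why
	 * the header bytes are skipped by hand and no adler32 check is done.
	 */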

	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
		printk(KERN_WARNING "inflateInit failed\n");
		ret = -1;
		goto out;
	}
	while (workspace->inf_strm.total_in < srclen) {
		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
		if (ret != Z_OK && ret != Z_STREAM_END)
			break;
		/*
		 * buf_start is the offset of the start of our workspace
		 * buffer within the decompressed data
		 */
		buf_start = total_out;

		/* total_out is the end of the data now in the workspace buffer */
		total_out = workspace->inf_strm.total_out;

		working_bytes = total_out - buf_start;

		/*
		 * start_byte is the first byte of the page we're currently
		 * copying into, relative to the start of the decompressed
		 * data.
		 */
		start_byte = page_offset(page_out) - disk_start;
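		/*
		 * Worked example, assuming 4K pages: if page_out maps the
		 * file page at disk_start + 8192, start_byte is 8192 and
		 * nothing is copied to it until total_out passes 8192.
		 */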

		if (working_bytes == 0) {
			/* we didn't make progress in this inflate
			 * call, we're done
			 */
			if (ret != Z_STREAM_END)
				ret = -1;
			break;
		}

		/* we haven't yet hit data corresponding to this page */
		if (total_out <= start_byte)
			goto next;

		/*
		 * the start of the data we care about is offset into
		 * the middle of our working buffer
		 */
		if (total_out > start_byte && buf_start < start_byte) {
			buf_offset = start_byte - buf_start;
			working_bytes -= buf_offset;
		} else {
			buf_offset = 0;
		}
		current_buf_start = buf_start;

		/* copy bytes from the working buffer into the pages */
		while (working_bytes > 0) {
			bytes = min(PAGE_CACHE_SIZE - pg_offset,
				    PAGE_CACHE_SIZE - buf_offset);
			bytes = min(bytes, working_bytes);
			kaddr = kmap_atomic(page_out, KM_USER0);
			memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
			       bytes);
			kunmap_atomic(kaddr, KM_USER0);
			flush_dcache_page(page_out);

			pg_offset += bytes;
			page_bytes_left -= bytes;
			buf_offset += bytes;
			working_bytes -= bytes;
			current_buf_start += bytes;

			/* check if we need to pick another page */
			if (page_bytes_left == 0) {
				page_out_index++;
				if (page_out_index >= vcnt) {
					ret = 0;
					goto done;
				}

				page_out = bvec[page_out_index].bv_page;
				pg_offset = 0;
				page_bytes_left = PAGE_CACHE_SIZE;
				start_byte = page_offset(page_out) - disk_start;

				/*
				 * make sure our new page is covered by this
				 * working buffer
				 */
				if (total_out <= start_byte)
					goto next;

				/* the next page in the biovec might not
				 * be adjacent to the last page, but it
				 * might still be found inside this working
				 * buffer.  bump our offset pointer
				 */
				if (total_out > start_byte &&
				    current_buf_start < start_byte) {
					buf_offset = start_byte - buf_start;
					working_bytes = total_out - start_byte;
					current_buf_start = buf_start +
						buf_offset;
				}
			}
		}
next:
		workspace->inf_strm.next_out = workspace->buf;
		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;

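		/*
		 * If this input page is exhausted, map the next one.  Once
		 * the last page has been consumed, data_in is set to NULL so
		 * the kunmap at done: below is skipped.
		 */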
		if (workspace->inf_strm.avail_in == 0) {
			unsigned long tmp;

			kunmap(pages_in[page_in_index]);
			page_in_index++;
			if (page_in_index >= total_pages_in) {
				data_in = NULL;
				break;
			}
			data_in = kmap(pages_in[page_in_index]);
			workspace->inf_strm.next_in = data_in;
			tmp = srclen - workspace->inf_strm.total_in;
			workspace->inf_strm.avail_in = min(tmp,
							   PAGE_CACHE_SIZE);
		}
	}
	if (ret != Z_STREAM_END)
		ret = -1;
	else
		ret = 0;
done:
	zlib_inflateEnd(&workspace->inf_strm);
	if (data_in)
		kunmap(pages_in[page_in_index]);
out:
	free_workspace(workspace);
	return ret;
}

/*
 * A less complex decompression routine.  Our compressed data fits in a
 * single page, and we want to read a single page out of it.
 * start_byte tells us the offset into the decompressed data we're
 * interested in.
 */
int btrfs_zlib_decompress(unsigned char *data_in,
			  struct page *dest_page,
			  unsigned long start_byte,
			  size_t srclen, size_t destlen)
{
	int ret = 0;
	int wbits = MAX_WBITS;
	struct workspace *workspace;
	unsigned long bytes_left = destlen;
	unsigned long total_out = 0;
	/* write offset inside dest_page; must persist across iterations */
	unsigned long pg_offset = 0;
	char *kaddr;

	if (destlen > PAGE_CACHE_SIZE)
		return -ENOMEM;

	workspace = find_zlib_workspace();
	if (IS_ERR(workspace))
		return -ENOMEM;

	workspace->inf_strm.next_in = data_in;
	workspace->inf_strm.avail_in = srclen;
	workspace->inf_strm.total_in = 0;

	workspace->inf_strm.next_out = workspace->buf;
	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
	workspace->inf_strm.total_out = 0;
	/* If it's deflate, and it's got no preset dictionary, then
	   we can tell zlib to skip the adler32 check. */
	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
	    !(((data_in[0]<<8) + data_in[1]) % 31)) {

		wbits = -((data_in[0] >> 4) + 8);
		workspace->inf_strm.next_in += 2;
		workspace->inf_strm.avail_in -= 2;
	}
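
	/*
	 * Same raw-deflate header detection as in
	 * btrfs_zlib_decompress_biovec(); the two could arguably share a
	 * helper.
	 */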

	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
		printk(KERN_WARNING "inflateInit failed\n");
		ret = -1;
		goto out;
	}

	while (bytes_left > 0) {
		unsigned long buf_start;
		unsigned long buf_offset;
		unsigned long bytes;

		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
		if (ret != Z_OK && ret != Z_STREAM_END)
			break;

		buf_start = total_out;
		total_out = workspace->inf_strm.total_out;

		/* no progress from this inflate call, bail out */
		if (total_out == buf_start) {
			ret = -1;
			break;
		}

		/* we haven't reached start_byte yet, keep inflating */
		if (total_out <= start_byte)
			goto next;

		if (total_out > start_byte && buf_start < start_byte)
			buf_offset = start_byte - buf_start;
		else
			buf_offset = 0;

		bytes = min(PAGE_CACHE_SIZE - pg_offset,
			    PAGE_CACHE_SIZE - buf_offset);
		bytes = min(bytes, bytes_left);

		kaddr = kmap_atomic(dest_page, KM_USER0);
		memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
		kunmap_atomic(kaddr, KM_USER0);

		pg_offset += bytes;
		bytes_left -= bytes;
next:
		workspace->inf_strm.next_out = workspace->buf;
		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
	}

	if (ret != Z_STREAM_END && bytes_left != 0)
		ret = -1;
	else
		ret = 0;

	zlib_inflateEnd(&workspace->inf_strm);
out:
	free_workspace(workspace);
	return ret;
}

void btrfs_zlib_exit(void)
{
	free_workspaces();
}