1/*	$NetBSD: offtab.c,v 1.15 2017/07/29 21:04:07 riastradh Exp $	*/
2
3/*-
4 * Copyright (c) 2014 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Taylor R. Campbell.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__RCSID("$NetBSD: offtab.c,v 1.15 2017/07/29 21:04:07 riastradh Exp $");
34
35#include <sys/types.h>
36#include <sys/endian.h>
37
38#include <assert.h>
39#include <err.h>
40#include <errno.h>
41#include <inttypes.h>
42#include <limits.h>
43#include <stdbool.h>
44#include <stdlib.h>
45#include <unistd.h>
46
47#include "common.h"
48#include "utils.h"
49
50#include "offtab.h"
51
52static void __printflike(1,2) __dead
53offtab_bug(const char *fmt, ...)
54{
55
56	errx(1, "bug in offtab, please report");
57}
58
59static void __printflike(1,2) __dead
60offtab_bugx(const char *fmt, ...)
61{
62
63	errx(1, "bug in offtab, please report");
64}
65
66static uint32_t
67offtab_compute_window_size(struct offtab *offtab, uint32_t start)
68{
69
70	assert(start < offtab->ot_n_offsets);
71	return MIN(offtab->ot_window_size, (offtab->ot_n_offsets - start));
72}
73
74static uint32_t
75offtab_current_window_size(struct offtab *offtab)
76{
77
78	return offtab_compute_window_size(offtab, offtab->ot_window_start);
79}
80
81static uint32_t
82offtab_current_window_end(struct offtab *offtab)
83{
84
85	assert(offtab->ot_window_start < offtab->ot_n_offsets);
86	assert(offtab_current_window_size(offtab) <=
87	    (offtab->ot_n_offsets - offtab->ot_window_start));
88	return (offtab->ot_window_start + offtab_current_window_size(offtab));
89}
90
91static void
92offtab_compute_window_position(struct offtab *offtab, uint32_t window_start,
93    size_t *bytes, off_t *pos)
94{
95	const uint32_t window_size = offtab_compute_window_size(offtab,
96	    window_start);
97
98	__CTASSERT(MUL_OK(size_t, MAX_WINDOW_SIZE, sizeof(uint64_t)));
99	*bytes = (window_size * sizeof(uint64_t));
100
101	assert(window_start <= offtab->ot_n_offsets);
102	__CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t)));
103	const off_t window_offset = ((off_t)window_start *
104	    (off_t)sizeof(uint64_t));
105
106	assert(offtab->ot_fdpos <= OFFTAB_MAX_FDPOS);
107	__CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS,
108		(off_t)MAX_N_OFFSETS*sizeof(uint64_t)));
109	assert(ADD_OK(off_t, offtab->ot_fdpos, window_offset));
110	*pos = (offtab->ot_fdpos + window_offset);
111}
112
/*
 * read_flags values for offtab_read_window and
 * offtab_maybe_read_window.  With OFFTAB_READ_SEEK the window is read
 * at its computed file position using pread_block; with
 * OFFTAB_READ_NOSEEK it is read from the descriptor's current
 * position using read_block.
 */
#define	OFFTAB_READ_SEEK	0x01
#define	OFFTAB_READ_NOSEEK	0x00
115
116static bool
117offtab_read_window(struct offtab *offtab, uint32_t blkno, int read_flags)
118{
119	const uint32_t window_start = rounddown(blkno, offtab->ot_window_size);
120	size_t window_bytes;
121	off_t window_pos;
122
123	assert(offtab->ot_mode == OFFTAB_MODE_READ);
124	assert(ISSET(read_flags, OFFTAB_READ_SEEK) ||
125	    (lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) ||
126	    ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE)));
127
128	offtab_compute_window_position(offtab, window_start,
129	    &window_bytes, &window_pos);
130	const ssize_t n_read = (ISSET(read_flags, OFFTAB_READ_SEEK)
131	    ? pread_block(offtab->ot_fd, offtab->ot_window, window_bytes,
132		window_pos)
133	    : read_block(offtab->ot_fd, offtab->ot_window, window_bytes));
134	if (n_read == -1) {
135		(*offtab->ot_report)("read offset table at %"PRIuMAX,
136		    (uintmax_t)window_pos);
137		return false;
138	}
139	assert(n_read >= 0);
140	if ((size_t)n_read != window_bytes) {
141		(*offtab->ot_reportx)("partial read of offset table"
142		    " at %"PRIuMAX": %zu != %zu",
143		    (uintmax_t)window_pos, (size_t)n_read, window_bytes);
144		return false;
145	}
146
147	offtab->ot_window_start = window_start;
148
149	return true;
150}
151
152static bool
153offtab_maybe_read_window(struct offtab *offtab, uint32_t blkno, int read_flags)
154{
155
156	/* Don't bother if blkno is already in the window.  */
157	if ((offtab->ot_window_start <= blkno) &&
158	    (blkno < offtab_current_window_end(offtab)))
159		return true;
160
161	if (!offtab_read_window(offtab, blkno, read_flags))
162		return false;
163
164	return true;
165}
166
167static void
168offtab_write_window(struct offtab *offtab)
169{
170	size_t window_bytes;
171	off_t window_pos;
172
173	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
174
175	offtab_compute_window_position(offtab, offtab->ot_window_start,
176	    &window_bytes, &window_pos);
177	const ssize_t n_written = pwrite(offtab->ot_fd, offtab->ot_window,
178	    window_bytes, window_pos);
179	if (n_written == -1)
180		err_ss(1, "write initial offset table");
181	assert(n_written >= 0);
182	if ((size_t)n_written != window_bytes)
183		errx_ss(1, "partial write of initial offset bytes: %zu <= %zu",
184		    (size_t)n_written,
185		    window_bytes);
186}
187
188static void
189offtab_maybe_write_window(struct offtab *offtab, uint32_t start, uint32_t end)
190{
191
192	/* Don't bother if [start, end) does not cover our window.  */
193	if (end <= offtab->ot_window_start)
194		return;
195	if (offtab_current_window_end(offtab) < start)
196		return;
197
198	offtab_write_window(offtab);
199}
200
201/*
202 * Initialize an offtab to support the specified number of offsets read
203 * to or written from fd at byte position fdpos.
204 */
205void
206offtab_init(struct offtab *offtab, uint32_t n_offsets, uint32_t window_size,
207    int fd, off_t fdpos)
208{
209
210	assert(offtab != NULL);
211	assert(0 < n_offsets);
212	assert(0 <= fd);
213	assert(0 <= fdpos);
214	assert(fdpos <= OFFTAB_MAX_FDPOS);
215
216	offtab->ot_n_offsets = n_offsets;
217	if ((window_size == 0) || (n_offsets < window_size))
218		offtab->ot_window_size = n_offsets;
219	else
220		offtab->ot_window_size = window_size;
221	assert(offtab->ot_window_size <= offtab->ot_n_offsets);
222	offtab->ot_window_start = (uint32_t)-1;
223	__CTASSERT(MUL_OK(size_t, MAX_WINDOW_SIZE, sizeof(uint64_t)));
224	offtab->ot_window = malloc(offtab->ot_window_size * sizeof(uint64_t));
225	if (offtab->ot_window == NULL)
226		err(1, "malloc offset table");
227	offtab->ot_blkno = (uint32_t)-1;
228	offtab->ot_fd = fd;
229	offtab->ot_fdpos = fdpos;
230	offtab->ot_report = &offtab_bug;
231	offtab->ot_reportx = &offtab_bugx;
232	offtab->ot_mode = OFFTAB_MODE_NONE;
233}
234
235/*
236 * Destroy an offtab.
237 */
238void
239offtab_destroy(struct offtab *offtab)
240{
241
242	free(offtab->ot_window);
243}
244
245/*
246 * For an offtab that has been used to read data from disk, convert it
247 * to an offtab that can be used to write subsequent data to disk.
248 * blkno is the last valid blkno read from disk.
249 */
250bool
251offtab_transmogrify_read_to_write(struct offtab *offtab, uint32_t blkno)
252{
253
254	assert(offtab->ot_mode == OFFTAB_MODE_READ);
255	assert(0 < blkno);
256
257	if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK))
258		return false;
259
260	offtab->ot_mode = OFFTAB_MODE_WRITE;
261	offtab->ot_blkno = blkno;
262
263	return true;
264}
265
266/*
267 * Reset an offtab for reading an offset table from the beginning.
268 * Initializes in-memory state and may read data from offtab->ot_fd,
269 * which must currently be at byte position offtab->ot_fdpos.  Failure
270 * will be reported by the report/reportx routines, which are called
271 * like warn/warnx.  May fail; returns true on success, false on
272 * failure.
273 *
274 * This almost has copypasta of offtab_prepare_get, but this uses read,
275 * rather than pread, so that it will work on nonseekable input if the
276 * window is the whole offset table.
277 */
278bool
279offtab_reset_read(struct offtab *offtab,
280    void (*report)(const char *, ...) __printflike(1,2),
281    void (*reportx)(const char *, ...) __printflike(1,2))
282{
283
284	assert((lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) ||
285	    ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE)));
286
287	offtab->ot_report = report;
288	offtab->ot_reportx = reportx;
289	offtab->ot_mode = OFFTAB_MODE_READ;
290	offtab->ot_blkno = (uint32_t)-1;
291
292	if (!offtab_read_window(offtab, 0, OFFTAB_READ_NOSEEK))
293		return false;
294
295	if (offtab->ot_window_size < offtab->ot_n_offsets) {
296		__CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t)));
297		const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets *
298		    (off_t)sizeof(uint64_t));
299		assert(offtab->ot_fdpos <= OFFTAB_MAX_FDPOS);
300		__CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS,
301			(off_t)MAX_N_OFFSETS*sizeof(uint64_t)));
302		assert(ADD_OK(off_t, offtab->ot_fdpos, offtab_bytes));
303		const off_t first_offset = (offtab->ot_fdpos + offtab_bytes);
304		if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1) {
305			(*offtab->ot_report)("lseek to first offset 0x%"PRIx64,
306			    first_offset);
307			return false;
308		}
309	}
310
311	return true;
312}
313
314/*
315 * Do any I/O or bookkeeping necessary to fetch the offset for blkno in
316 * preparation for a call to offtab_get.  May fail; returns true on
317 * success, false on failure.
318 */
319bool
320offtab_prepare_get(struct offtab *offtab, uint32_t blkno)
321{
322
323	assert(offtab->ot_mode == OFFTAB_MODE_READ);
324	assert(blkno < offtab->ot_n_offsets);
325
326	if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK))
327		return false;
328
329	assert(offtab->ot_window_start <= blkno);
330	assert(blkno < offtab_current_window_end(offtab));
331
332	offtab->ot_blkno = blkno;
333	return true;
334}
335
336/*
337 * Return the offset for blkno.  Caller must have called
338 * offtab_prepare_get beforehand.
339 */
340uint64_t
341offtab_get(struct offtab *offtab, uint32_t blkno)
342{
343
344	assert(offtab->ot_mode == OFFTAB_MODE_READ);
345	assert(blkno == offtab->ot_blkno);
346	assert(offtab->ot_window_start <= blkno);
347	assert(blkno < offtab_current_window_end(offtab));
348
349	return be64toh(offtab->ot_window[blkno - offtab->ot_window_start]);
350}
351
352/*
353 * Reset offtab for writing a fresh offset table.  Initializes
354 * in-memory state and writes an empty offset table to offtab->ot_fd,
355 * which must currently be at byte position offtab->ot_fdpos.  May
356 * fail; returns on success, aborts with err(3) on failure.
357 */
358void
359offtab_reset_write(struct offtab *offtab)
360{
361	uint32_t i;
362
363	assert(lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos);
364
365	offtab->ot_mode = OFFTAB_MODE_WRITE;
366	offtab->ot_blkno = (uint32_t)-1;
367
368	/*
369	 * Initialize the offset table to all ones (except for the
370	 * fixed first offset) so that we can easily detect where we
371	 * were interrupted if we want to restart.
372	 */
373	__CTASSERT(MAX_N_OFFSETS <= UINT32_MAX);
374	assert(offtab->ot_n_offsets > 0);
375
376	/* Initialize window of all ones.  */
377	for (i = 0; i < offtab->ot_window_size; i++)
378		offtab->ot_window[i] = ~(uint64_t)0;
379
380	/* Write the window to every position in the table.  */
381	const uint32_t n_windows =
382	    howmany(offtab->ot_n_offsets, offtab->ot_window_size);
383	for (i = 1; i < n_windows; i++) {
384		/* Change the start but reuse the all-ones buffer.  */
385		offtab->ot_window_start = (i * offtab->ot_window_size);
386		offtab_write_window(offtab);
387	}
388
389	/* Compute the number of bytes in the offset table.  */
390	__CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t)));
391	const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets *
392	    sizeof(uint64_t));
393
394	/* Compute the offset of the first block.  */
395	assert(offtab->ot_fdpos <= OFFTAB_MAX_FDPOS);
396	__CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS,
397		MAX_N_OFFSETS*sizeof(uint64_t)));
398	assert(ADD_OK(off_t, offtab->ot_fdpos, offtab_bytes));
399	const off_t first_offset = (offtab->ot_fdpos + offtab_bytes);
400
401	/* Assert that it fits in 64 bits.  */
402	__CTASSERT(MUL_OK(uint64_t, MAX_N_OFFSETS, sizeof(uint64_t)));
403	__CTASSERT(ADD_OK(uint64_t, OFFTAB_MAX_FDPOS,
404		(uint64_t)MAX_N_OFFSETS*sizeof(uint64_t)));
405
406	/* Write out the first window with the first offset.  */
407	offtab->ot_window_start = 0;
408	offtab->ot_window[0] = htobe64((uint64_t)first_offset);
409	offtab_write_window(offtab);
410
411	if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1)
412		err(1, "lseek to first offset failed");
413}
414
415/*
416 * Guarantee that the disk reflects block offsets [0, n_offsets).  If
417 * OFFTAB_CHECKPOINT_SYNC is set in flags, will also fsync the entire
418 * offset table.  May fail; returns on success, aborts with err(3) on
419 * failure.  Fsync failure is considered success but is reported with a
420 * warning.
421 *
422 * This routine does not write state in memory, and does not read state
423 * that is not signal-safe.  The only state read is offtab->ot_window,
424 * offtab->ot_window_start, and quantities that are static for the
425 * signal-interruptable existence of the offset table.
426 */
427void
428offtab_checkpoint(struct offtab *offtab, uint32_t n_offsets, int flags)
429{
430
431	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
432	assert(n_offsets <= offtab->ot_n_offsets);
433
434	/*
435	 * Write the window unless we just did that and were
436	 * interrupted before we could move the window.
437	 */
438	if (offtab->ot_window != NULL)
439		offtab_maybe_write_window(offtab, 0, n_offsets);
440
441	if (ISSET(flags, OFFTAB_CHECKPOINT_SYNC)) {
442		__CTASSERT(MUL_OK(off_t, MAX_N_OFFSETS, sizeof(uint64_t)));
443		const off_t sync_bytes = ((off_t)n_offsets *
444		    (off_t)sizeof(uint64_t));
445		__CTASSERT(ADD_OK(off_t, OFFTAB_MAX_FDPOS,
446			MAX_N_OFFSETS*sizeof(uint64_t)));
447		assert(ADD_OK(off_t, offtab->ot_fdpos, sync_bytes));
448		if (fsync_range(offtab->ot_fd, (FFILESYNC | FDISKSYNC),
449			offtab->ot_fdpos, (offtab->ot_fdpos + sync_bytes))
450		    == -1)
451			warn_ss("fsync of offset table failed");
452	}
453}
454
/*
 * Do any I/O or bookkeeping necessary to set an offset for blkno.  May
 * fail; returns on success, aborts with err(3) on failure.
 *
 * The offtab must be in write mode and blkno must be a valid index.
 * If blkno lies outside the current window, the current window is
 * written to disk, its buffer is refilled with all ones, and the
 * window is moved to the one containing blkno.  On return blkno is
 * recorded as the block that offtab_put may set.
 */
void
offtab_prepare_put(struct offtab *offtab, uint32_t blkno)
{
	uint32_t i;

	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
	assert(blkno < offtab->ot_n_offsets);

	/*
	 * Assume, for convenience, that we write blocks in order.
	 * Thus we need not do another read -- we can just clear the
	 * window.
	 */
	assert((offtab->ot_blkno == (uint32_t)-1) ||
	    ((offtab->ot_blkno + 1) == blkno));

	/* If it's already in our window, we're good to go.  */
	if ((offtab->ot_window_start <= blkno) &&
	    (blkno < offtab_current_window_end(offtab)))
		goto win;

	/* Otherwise, write out the current window and choose a new one.  */
	offtab_write_window(offtab);

	/*
	 * Because blocks are written in order, blkno must be the first
	 * entry of the window immediately following the current one.
	 */
	assert(offtab->ot_window_size <= blkno);
	assert(offtab->ot_window_start == (blkno - offtab->ot_window_size));
	assert((offtab->ot_window_start + offtab->ot_window_size) ==
	    rounddown(blkno, offtab->ot_window_size));

    {
	uint64_t *window;
	sigset_t sigmask;

	/*
	 * Mark the window as being updated so nobody tries to write it
	 * (since we just wrote it) while we fill it with ones.
	 */
	block_signals(&sigmask);
	window = offtab->ot_window;
	offtab->ot_window = NULL;
	restore_sigmask(&sigmask);

	/*
	 * Fill the window with ones.  ot_window_start still refers to
	 * the old window here, so the loop bound is the old window's
	 * size.
	 */
	for (i = 0; i < offtab_current_window_size(offtab); i++)
		window[i] = ~(uint64_t)0;

	/*
	 * Restore the window as ready again.  The buffer and its new
	 * start are published together with signals blocked.
	 */
	block_signals(&sigmask);
	offtab->ot_window = window;
	offtab->ot_window_start = rounddown(blkno, offtab->ot_window_size);
	restore_sigmask(&sigmask);
    }

win:	assert(offtab->ot_window_start <= blkno);
	assert(blkno < offtab_current_window_end(offtab));

	offtab->ot_blkno = blkno;
}
517
518/*
519 * Actually set the offset for blkno.
520 */
521void
522offtab_put(struct offtab *offtab, uint32_t blkno, uint64_t offset)
523{
524
525	assert(offtab->ot_mode == OFFTAB_MODE_WRITE);
526	assert(blkno == offtab->ot_blkno);
527	assert(offtab->ot_window_start <= blkno);
528	assert(blkno < offtab_current_window_end(offtab));
529
530	offtab->ot_window[blkno - offtab->ot_window_start] = htobe64(offset);
531}
532