1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/types.h>
27#include <sys/ksynch.h>
28#include <sys/kmem.h>
29#include <sys/errno.h>
30#include <sys/cmn_err.h>
31#include <sys/debug.h>
32#include <sys/cred.h>
33#include <sys/file.h>
34#include <sys/ddi.h>
35#include <sys/nsctl/nsctl.h>
36#include <sys/unistat/spcs_s.h>
37#include <sys/unistat/spcs_errors.h>
38
39#include <sys/unistat/spcs_s_k.h>
40#include "dsw.h"
41#include "dsw_dev.h"
42
43#ifdef DS_DDICT
44#include "../contract.h"
45#endif
46
47#include <sys/sdt.h>		/* dtrace is S10 or later */
48
49/*
50 * Instant Image.
51 *
52 * This file contains the chunk map lookup functions of II.
53 *
54 */
/*
 * Convert a chunk id into the FBA position handed to _ii_nsc_io() when the
 * free-list link stored in that chunk is read or written.  Plain alias for
 * DSW_CHK2FBA().
 */
#define	CHUNK_FBA(chunk) DSW_CHK2FBA(chunk)

extern int ii_debug;	/* debug level switch */
/*
 * Debug switch private to the chunk map code.
 * NOTE(review): nothing in this file reads it; presumably kept so it can
 * be patched at run time -- confirm before removing.
 */
int ii_map_debug = 0;
59
/*
 * Index of a NODE within the on-bitmap node table.  64 bits wide when
 * II_MULTIMULTI_TERABYTE is defined, 32 bits wide otherwise.
 */
#ifdef II_MULTIMULTI_TERABYTE
typedef	int64_t	nodeid_t;
typedef	int32_t	nodeid32_t;
#else
typedef	int32_t	nodeid_t;
#endif

/*
 * One entry of the node table kept on the bitmap volume.  The entry at
 * index N holds the chunk id that ii_tsearch() returns for key N;
 * II_NULLCHUNK marks an entry that has no chunk allocated yet.
 */
typedef struct	ii_node {
	chunkid_t	vchunk_id;		/* virtual chunk id */
} NODE;

/*
 * Free-list link.  alloc_chunk()/free_node() and the overflow routines
 * read and write one of these at CHUNK_FBA(chunk) of a free chunk; it
 * names the next free chunk in the list.
 */
typedef struct ii_nodelink_s {
	chunkid_t	next_chunk;
} ii_nodelink_t;

/* number of NODE entries that fit in a single FBA */
static	int	nodes_per_fba = FBA_SIZE(1) / sizeof (NODE);
76
/*
 * Routines defined outside this file that the map code calls:
 * bitmap header access, device reserve/release and error reporting.
 */
ii_header_t *_ii_bm_header_get(_ii_info_t *ip, nsc_buf_t **tmp);
int _ii_bm_header_put(ii_header_t *hdr, _ii_info_t *ip,
    nsc_buf_t *tmp);
void _ii_rlse_devs(_ii_info_t *, int);
int _ii_rsrv_devs(_ii_info_t *, int, int);
void _ii_error(_ii_info_t *, int);
/*
 * Private functions for use in this file.
 */
static void free_node(_ii_info_t *ip, NODE *np, nodeid_t ni);
static chunkid_t ii_alloc_overflow(_ii_info_t *ip);
/* ii_free_overflow() is defined in this file but is not static */
void ii_free_overflow(_ii_info_t *, chunkid_t);
/* FBA-aligned read/write helper, defined outside this file */
extern int _ii_nsc_io(_ii_info_t *, int, nsc_fd_t *, int, nsc_off_t,
    unsigned char *, nsc_size_t);
91
92static int
93update_tree_header(_ii_info_t *ip)
94{
95	ii_header_t *header;
96	nsc_buf_t	*tmp = NULL;
97
98	mutex_enter(&ip->bi_mutex);
99	header = _ii_bm_header_get(ip, &tmp);
100	if (header == NULL) {
101		/* bitmap is probably offline */
102		mutex_exit(&ip->bi_mutex);
103		DTRACE_PROBE(_iit_update_tree_header_end);
104		return (1);
105	}
106	header->ii_mstchks = ip->bi_mstchks;
107	header->ii_shdchks = ip->bi_shdchks;
108	header->ii_shdchkused = ip->bi_shdchkused;
109	header->ii_shdfchk = ip->bi_shdfchk;
110	(void) _ii_bm_header_put(header, ip, tmp);
111	mutex_exit(&ip->bi_mutex);
112
113	return (0);
114}
115
116static int
117update_overflow_header(_ii_info_t *ip, _ii_overflow_t *op)
118{
119	(void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_WRBUF,
120	    II_OHEADER_FBA, (unsigned char *)&(op->ii_do),
121	    sizeof (_ii_doverflow_t));
122
123	return (0);
124}
125
126static int
127node_io(_ii_info_t *ip, NODE *np, nodeid_t node, int flag)
128{
129	int	rc;
130	int	node_fba;
131	int	tree_fba = ip->bi_copyfba + (ip->bi_copyfba-ip->bi_shdfba);
132	int	offset;
133	nsc_buf_t *tmp = NULL;
134
135	/*
136	 * Don't use _ii_nsc_io() as _ii_nsc_io() requires io to start at
137	 * an fba boundary.
138	 */
139
140	/* calculate location of node on bitmap file */
141	offset = (node % nodes_per_fba) * sizeof (NODE);
142	node_fba = tree_fba + node / nodes_per_fba;
143
144	/* read disk block containing node */
145	rc = nsc_alloc_buf(ip->bi_bmpfd, node_fba, 1, NSC_RDBUF|flag, &tmp);
146	if (!II_SUCCESS(rc)) {
147		_ii_error(ip, DSW_BMPOFFLINE);
148		if (tmp)
149			(void) nsc_free_buf(tmp);
150
151		DTRACE_PROBE(_iit_node_io_end);
152		return (1);
153	}
154
155	/* copy node and update bitmap file if needed */
156	rc = 0;
157	if (flag == NSC_RDBUF)
158		bcopy(tmp->sb_vec->sv_addr+offset, np, sizeof (NODE));
159	else {
160		bcopy(np, tmp->sb_vec->sv_addr+offset, sizeof (NODE));
161		II_NSC_WRITE(ip, bitmap, rc, tmp, node_fba, 1, 0);
162		if (!II_SUCCESS(rc)) {
163			_ii_error(ip, DSW_BMPOFFLINE);
164			rc = EIO;
165		}
166	}
167	if (tmp)
168		(void) nsc_free_buf(tmp);
169
170	return (0);
171}
172
173static int
174node_fba_fill(_ii_info_t *ip, nsc_size_t nchunks, chunkid_t vchunk_id)
175{
176	int	rc;
177	nsc_off_t	fba;
178	nsc_size_t	fbas;
179	nsc_size_t	maxfbas;
180	nsc_buf_t *bp;
181	nsc_vec_t *vp;
182
183	/* Determine maximum number of FBAs to allocate */
184	rc =  nsc_maxfbas(ip->bi_bmpfd, 0, &maxfbas);
185	if (!II_SUCCESS(rc))
186		maxfbas = DSW_CBLK_FBA;
187
188	/* Write out blocks of initialied NODEs */
189	fba = ip->bi_copyfba + (ip->bi_copyfba-ip->bi_shdfba);
190	fbas = FBA_LEN(nchunks * sizeof (NODE));
191	while (fbas > 0) {
192
193		/* Determine number of FBA to allocate this time */
194		if (fbas < maxfbas) maxfbas = fbas;
195
196		/* Allocate buffer which map to FBAs containing NODEs */
197		bp = NULL;
198		rc = nsc_alloc_buf(ip->bi_bmpfd, fba, maxfbas, NSC_WRBUF, &bp);
199		if (!II_SUCCESS(rc)) {
200			_ii_error(ip, DSW_BMPOFFLINE);
201			DTRACE_PROBE(alloc_buf_failed);
202			return (EIO);
203		}
204
205		/* traverse vector list, filling wth initialized NODEs */
206		for (vp = bp->sb_vec; vp->sv_addr && vp->sv_len; vp++) {
207			NODE *pnode = (NODE *)vp->sv_addr;
208			NODE *enode = (NODE *)(vp->sv_addr +  vp->sv_len);
209			while (pnode < enode) {
210				pnode->vchunk_id = vchunk_id;
211				pnode++;
212			}
213		}
214
215		/* write FBAs containing initialized NODEs */
216		II_NSC_WRITE(ip, bitmap, rc, bp, fba, maxfbas, 0);
217		if (!II_SUCCESS(rc)) {
218			_ii_error(ip, DSW_BMPOFFLINE);
219			(void) nsc_free_buf(bp);
220			DTRACE_PROBE(write_failed);
221			return (EIO);
222		}
223
224		/* free the buffer */
225		(void) nsc_free_buf(bp);
226
227		/* Adjust nsc buffer values */
228		fba += maxfbas;
229		fbas -= maxfbas;
230	}
231
232	return (0);
233}
234
235/*
236 * Reads the node into core and returns a pointer to it.
237 */
238
239static NODE *
240read_node(_ii_info_t *ip, nodeid_t node)
241{
242	NODE *new;
243
244	new = (NODE *)kmem_alloc(sizeof (NODE), KM_SLEEP);
245
246	if (node_io(ip, new, node, NSC_RDBUF)) {
247		kmem_free(new, sizeof (NODE));
248		new = NULL;
249	}
250
251	return (new);
252}
253
254
255static chunkid_t
256alloc_chunk(_ii_info_t *ip)
257{
258	ii_nodelink_t nl;
259	int fba;
260	chunkid_t rc = II_NULLCHUNK;
261
262	mutex_enter(&ip->bi_chksmutex);
263	if (ip->bi_shdchkused < ip->bi_shdchks) {
264		rc = ip->bi_shdchkused++;
265	} else if (ip->bi_shdfchk != II_NULLCHUNK) {
266		ASSERT(ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks);
267		rc = ip->bi_shdfchk;
268		fba = CHUNK_FBA(rc);
269		(void) _ii_rsrv_devs(ip, SHDR, II_INTERNAL);
270		(void) _ii_nsc_io(ip, KS_SHD, SHDFD(ip), NSC_RDBUF, fba,
271		    (unsigned char *)&nl, sizeof (nl));
272		_ii_rlse_devs(ip, SHDR);
273		ip->bi_shdfchk = nl.next_chunk;
274		ASSERT(ip->bi_shdfchk == II_NULLCHUNK ||
275		    (ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks));
276	} else {
277
278		/* into overflow */
279		rc = ii_alloc_overflow(ip);
280	}
281	mutex_exit(&ip->bi_chksmutex);
282	(void) update_tree_header(ip);
283
284	return (rc);
285}
286
287/*
288 * releases memory for node
289 */
290static void	/*ARGSUSED*/
291release_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
292{
293	kmem_free(np, sizeof (NODE));
294
295}
296
297static void
298write_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
299{
300	(void) node_io(ip, np, ni, NSC_WRBUF);
301	release_node(ip, np, ni);
302
303}
304
305static void
306free_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
307{
308	ii_nodelink_t nl;
309	int	fba;
310
311	if (np == NULL) {
312		DTRACE_PROBE(_iit_free_node_end);
313		return;
314	}
315
316	mutex_enter(&ip->bi_chksmutex);
317	if (II_ISOVERFLOW(np->vchunk_id)) {
318		/* link chunk onto overflow free list */
319		ii_free_overflow(ip, np->vchunk_id);
320	} else {
321		/* write old free list head into chunk */
322		nl.next_chunk = ip->bi_shdfchk;
323		ip->bi_shdfchk = np->vchunk_id;
324		ASSERT(ip->bi_shdfchk == II_NULLCHUNK ||
325		    (ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks));
326		fba = CHUNK_FBA(np->vchunk_id);
327		(void) _ii_rsrv_devs(ip, SHDR, II_INTERNAL);
328		(void) _ii_nsc_io(ip, KS_SHD, SHDFD(ip), NSC_WRBUF, fba,
329		    (unsigned char *)&nl, sizeof (nl));
330		_ii_rlse_devs(ip, SHDR);
331		/* update free counts */
332		/* ip->bi_unused++; */
333	}
334	np->vchunk_id = II_NULLCHUNK;
335	(void) node_io(ip, np, ni, NSC_WRBUF);
336	(void) update_tree_header(ip);
337	mutex_exit(&ip->bi_chksmutex);
338
339}
340
341/*
342 * Public functions for dsw_dev to use.
343 */
344
345/*
346 * Overflow volume functions.
347 */
348
349/* put overflow chunk on the overflow volume free list */
350void
351ii_free_overflow(_ii_info_t *ip, chunkid_t chunk)
352{
353	ii_nodelink_t nl;
354	_ii_overflow_t *op;
355	int fba;
356
357	if (!II_ISOVERFLOW(chunk)) {
358		DTRACE_PROBE(_iit_free_overflow_end_1);
359		return;
360	}
361	chunk = II_2OVERFLOW(chunk);
362
363	op = ip->bi_overflow;
364	if (op == NULL) {
365#ifdef DEBUG
366		cmn_err(CE_PANIC, "overflow used, but not attached ip %p",
367		    (void *) ip);
368#endif
369		DTRACE_PROBE(_iit_free_overflow_end_2);
370		return;
371	}
372	mutex_enter(&(op->ii_mutex));
373
374	DTRACE_PROBE(_iit_free_overflow);
375
376	/* write old free list head into chunk */
377	nl.next_chunk = op->ii_freehead;
378	fba = CHUNK_FBA(chunk);
379	(void) nsc_reserve(op->ii_dev->bi_fd, NSC_MULTI);
380	(void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_WRBUF, fba,
381	    (unsigned char *)&nl, sizeof (nl));
382	/* update free counts */
383	op->ii_unused++;
384	ASSERT(op->ii_used > 0);		/* always use 1 for header */
385
386	/* write chunk id into header freelist start */
387	op->ii_freehead =  chunk;
388
389	(void) update_overflow_header(ip, op);
390	nsc_release(op->ii_dev->bi_fd);
391	mutex_exit(&(op->ii_mutex));
392
393}
394
395/* reclaim any overflow storage used by the volume */
396void
397ii_reclaim_overflow(_ii_info_t *ip)
398{
399	NODE	*node;
400	nodeid_t node_id;
401	_ii_overflow_t *op;
402
403	if ((ip->bi_flags & (DSW_VOVERFLOW | DSW_FRECLAIM)) == 0) {
404		DTRACE_PROBE(_iit_reclaim_overflow_end);
405		return;
406	}
407
408	/*
409	 * Determine whether overflow should be reclaimed:
410	 * 1/ If we're not doing a group volume update
411	 * OR
412	 * 2/ If the number of detaches != number of attached vols
413	 */
414	op = ip->bi_overflow;
415	if (op && (((op->ii_flags & IIO_VOL_UPDATE) == 0) ||
416	    (op->ii_detachcnt != op->ii_drefcnt))) {
417#ifndef II_MULTIMULTI_TERABYTE
418		/* assert volume size fits into node_id */
419		ASSERT(ip->bi_mstchks <= INT32_MAX);
420#endif
421		for (node_id = 0; node_id < ip->bi_mstchks; node_id++) {
422			if ((node = read_node(ip, node_id)) == NULL) {
423				DTRACE_PROBE(_iit_reclaim_overflow_end);
424				return;
425			}
426			ii_free_overflow(ip, node->vchunk_id);
427			release_node(ip, node, node_id);
428		}
429	} else {
430		/* need to reset the overflow volume header */
431		op->ii_freehead = II_NULLNODE;
432		op->ii_used = 1;		/* we have used the header */
433		op->ii_unused = op->ii_nchunks - op->ii_used;
434		(void) update_overflow_header(ip, op);
435	}
436
437	DTRACE_PROBE(_iit_reclaim_overflow);
438
439	if ((ip->bi_flags & DSW_VOVERFLOW) == DSW_VOVERFLOW) {
440		mutex_enter(&ip->bi_mutex);
441		II_FLAG_CLR(DSW_VOVERFLOW, ip);
442		mutex_exit(&ip->bi_mutex);
443	}
444	--iigkstat.spilled_over.value.ul;
445
446}
447
448static chunkid_t
449ii_alloc_overflow(_ii_info_t *ip)
450{
451	chunkid_t chunk;
452	ii_nodelink_t nl;
453	_ii_overflow_t *op;
454	int fba;
455
456	if ((op = ip->bi_overflow) == NULL) {
457		DTRACE_PROBE(_iit_alloc_overflow_end);
458		return (II_NULLCHUNK);	/* no overflow volume attached */
459	}
460
461	mutex_enter(&(op->ii_mutex));
462
463	DTRACE_PROBE(_iit_alloc_overflow);
464
465	if (op->ii_unused < 1) {
466		mutex_exit(&(op->ii_mutex));
467		DTRACE_PROBE(_iit_alloc_overflow_end);
468		return (II_NULLCHUNK);
469	}
470	(void) nsc_reserve(op->ii_dev->bi_fd, NSC_MULTI);
471	if (op->ii_freehead != II_NULLCHUNK) {
472		/* pick first from free list */
473		chunk = op->ii_freehead;
474		fba = CHUNK_FBA(chunk);
475		(void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_RDBUF, fba,
476		    (unsigned char *)&nl, sizeof (nl));
477		op->ii_freehead = nl.next_chunk;
478		/* decrease unused count, fix bug 4419956 */
479		op->ii_unused--;
480	} else {
481		/* otherwise pick first unused */
482		if (op->ii_used > op->ii_nchunks)
483			chunk = II_NULLCHUNK;
484		else {
485			chunk = op->ii_used++;
486			op->ii_unused--;
487		}
488	}
489	if (chunk != II_NULLCHUNK) {
490		chunk = II_2OVERFLOW(chunk);
491		if ((ip->bi_flags&DSW_VOVERFLOW) == 0) {
492			mutex_enter(&ip->bi_mutex);
493			II_FLAG_SET(DSW_VOVERFLOW, ip);
494			mutex_exit(&ip->bi_mutex);
495			++iigkstat.spilled_over.value.ul;
496		}
497	}
498	(void) update_overflow_header(ip, op);
499	nsc_release(op->ii_dev->bi_fd);
500	mutex_exit(&(op->ii_mutex));
501
502	return (chunk);
503}
504/*
505 * Find or insert key into search tree.
506 */
507
508chunkid_t
509ii_tsearch(_ii_info_t *ip, chunkid_t chunk_id)
510			/* Address of the root of the tree */
511{
512	NODE	*rootp = NULL;
513	chunkid_t n;	/* New node id if key not found */
514
515	if ((rootp = read_node(ip, chunk_id)) == NULL) {
516		DTRACE_PROBE(_iit_tsearch_end);
517		return (II_NULLNODE);
518	}
519	n = rootp->vchunk_id;
520	if (n != II_NULLCHUNK) { /* chunk allocated, return location */
521		release_node(ip, rootp, 0);
522		DTRACE_PROBE(_iit_tsearch_end);
523		return (n);
524	}
525	n = alloc_chunk(ip);
526	if (n != II_NULLCHUNK) {
527		rootp->vchunk_id = n;
528		write_node(ip, rootp, chunk_id);
529	} else
530		release_node(ip, rootp, 0);
531
532	return (n);
533}
534
535/* Delete node with key chunkid */
536void
537ii_tdelete(_ii_info_t *ip,
538	chunkid_t chunkid)	/* Key to be deleted */
539{
540	NODE *np = NULL;
541
542	if ((np = read_node(ip, chunkid)) == NULL) {
543		DTRACE_PROBE(_iit_tdelete_end);
544		return;
545	}
546
547	ASSERT(np->vchunk_id != II_NULLCHUNK);
548	free_node(ip, np, chunkid);
549	np->vchunk_id = II_NULLCHUNK;
550	write_node(ip, np, chunkid);
551
552}
553
554/*
555 * initialise an empty map for ip
556 */
557
558int
559ii_tinit(_ii_info_t *ip)
560{
561	int rc = 0;
562
563	/* overflow can't be attached before first call to this function */
564	if (ip->bi_overflow)
565		ii_reclaim_overflow(ip);
566
567	mutex_enter(&ip->bi_chksmutex);
568	ip->bi_shdfchk = II_NULLCHUNK;	/* set freelist to empty chain */
569	ip->bi_shdchkused = 0;
570
571	/* fill index (bi_mstchks size) with II_NULLCHUNK */
572	rc = node_fba_fill(ip, ip->bi_mstchks, II_NULLCHUNK);
573	if (rc == 0)
574		rc = update_tree_header(ip);
575	mutex_exit(&ip->bi_chksmutex);
576
577	return (rc);
578}
579
580/*
581 * Calculate the size of map space provided by a bitmap volume with
582 * tree_len fba's spare for the tree.
583 */
584
585nsc_size_t
586ii_btsize(nsc_size_t tree_len)
587{
588	nsc_size_t nchunks;
589
590	nchunks = tree_len * nodes_per_fba;
591
592	if (ii_debug > 1)
593		cmn_err(CE_NOTE,
594		    "!ii_btsize: bitmap with %" NSC_SZFMT
595		    " spare fba's will map %" NSC_SZFMT " chunks",
596		    tree_len, nchunks);
597
598	return (nchunks);
599}
600