PageRenderTime 105ms CodeModel.GetById 3ms app.highlight 89ms RepoModel.GetById 1ms app.codeStats 1ms

/usr/src/uts/common/fs/zfs/zvol.c

https://bitbucket.org/0xffea/illumos-dccp
C | 1999 lines | 1488 code | 283 blank | 228 comment | 325 complexity | 5571b06c47f33fa8d8cf2c66f0c3a155 MD5 | raw file
   1/*
   2 * CDDL HEADER START
   3 *
   4 * The contents of this file are subject to the terms of the
   5 * Common Development and Distribution License (the "License").
   6 * You may not use this file except in compliance with the License.
   7 *
   8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9 * or http://www.opensolaris.org/os/licensing.
  10 * See the License for the specific language governing permissions
  11 * and limitations under the License.
  12 *
  13 * When distributing Covered Code, include this CDDL HEADER in each
  14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15 * If applicable, add the following below this CDDL HEADER, with the
  16 * fields enclosed by brackets "[]" replaced with your own identifying
  17 * information: Portions Copyright [yyyy] [name of copyright owner]
  18 *
  19 * CDDL HEADER END
  20 */
  21/*
  22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23 *
  24 * Portions Copyright 2010 Robert Milkowski
  25 *
  26 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  27 * Copyright (c) 2012 by Delphix. All rights reserved.
  28 */
  29
  30/*
  31 * ZFS volume emulation driver.
  32 *
  33 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  34 * Volumes are accessed through the symbolic links named:
  35 *
  36 * /dev/zvol/dsk/<pool_name>/<dataset_name>
  37 * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  38 *
  39 * These links are created by the /dev filesystem (sdev_zvolops.c).
  40 * Volumes are persistent through reboot.  No user command needs to be
  41 * run before opening and using a device.
  42 */
  43
  44#include <sys/types.h>
  45#include <sys/param.h>
  46#include <sys/errno.h>
  47#include <sys/uio.h>
  48#include <sys/buf.h>
  49#include <sys/modctl.h>
  50#include <sys/open.h>
  51#include <sys/kmem.h>
  52#include <sys/conf.h>
  53#include <sys/cmn_err.h>
  54#include <sys/stat.h>
  55#include <sys/zap.h>
  56#include <sys/spa.h>
  57#include <sys/zio.h>
  58#include <sys/dmu_traverse.h>
  59#include <sys/dnode.h>
  60#include <sys/dsl_dataset.h>
  61#include <sys/dsl_prop.h>
  62#include <sys/dkio.h>
  63#include <sys/efi_partition.h>
  64#include <sys/byteorder.h>
  65#include <sys/pathname.h>
  66#include <sys/ddi.h>
  67#include <sys/sunddi.h>
  68#include <sys/crc32.h>
  69#include <sys/dirent.h>
  70#include <sys/policy.h>
  71#include <sys/fs/zfs.h>
  72#include <sys/zfs_ioctl.h>
  73#include <sys/mkdev.h>
  74#include <sys/zil.h>
  75#include <sys/refcount.h>
  76#include <sys/zfs_znode.h>
  77#include <sys/zfs_rlock.h>
  78#include <sys/vdev_disk.h>
  79#include <sys/vdev_impl.h>
  80#include <sys/zvol.h>
  81#include <sys/dumphdr.h>
  82#include <sys/zil_impl.h>
  83
  84#include "zfs_namecheck.h"
  85
/* Soft-state anchor handed to ddi_soft_state_zalloc()/ddi_get_soft_state(). */
void *zfsdev_state;
/* Tag used for the objset ownership and bonus hold taken on first open. */
static char *zvol_tag = "zvol_tag";

#define	ZVOL_DUMPSIZE		"dumpsize"

/*
 * This lock protects the zfsdev_state structure from being modified
 * while it's being used, e.g. an open that comes in before a create
 * finishes.  It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
kmutex_t zfsdev_state_lock;
/* Incremented by zvol_create_minor(), decremented by zvol_remove_zv(). */
static uint32_t zvol_minors;
  99
 100typedef struct zvol_extent {
 101	list_node_t	ze_node;
 102	dva_t		ze_dva;		/* dva associated with this extent */
 103	uint64_t	ze_nblks;	/* number of blocks in extent */
 104} zvol_extent_t;
 105
 106/*
 107 * The in-core state of each volume.
 108 */
 109typedef struct zvol_state {
 110	char		zv_name[MAXPATHLEN]; /* pool/dd name */
 111	uint64_t	zv_volsize;	/* amount of space we advertise */
 112	uint64_t	zv_volblocksize; /* volume block size */
 113	minor_t		zv_minor;	/* minor number */
 114	uint8_t		zv_min_bs;	/* minimum addressable block shift */
 115	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
 116	objset_t	*zv_objset;	/* objset handle */
 117	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
 118	uint32_t	zv_total_opens;	/* total open count */
 119	zilog_t		*zv_zilog;	/* ZIL handle */
 120	list_t		zv_extents;	/* List of extents for dump */
 121	znode_t		zv_znode;	/* for range locking */
 122	dmu_buf_t	*zv_dbuf;	/* bonus handle */
 123} zvol_state_t;
 124
/*
 * zvol specific flags (bits of zvol_state_t.zv_flags)
 */
#define	ZVOL_RDONLY	0x1	/* opens with FWRITE fail with EROFS */
#define	ZVOL_DUMPIFIED	0x2	/* presumably: volume set up as a dump device */
#define	ZVOL_EXCL	0x4	/* set by an FEXCL open; other opens get EBUSY */
#define	ZVOL_WCE	0x8	/* presumably: write cache enabled -- confirm */

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS/2;

/* Defined outside this file. */
extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
    nvlist_t *, nvlist_t *);
/* Forward declarations for functions used before their definitions. */
static int zvol_remove_zv(zvol_state_t *);
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
 145
 146static void
 147zvol_size_changed(uint64_t volsize, major_t maj, minor_t min)
 148{
 149	dev_t dev = makedevice(maj, min);
 150
 151	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
 152	    "Size", volsize) == DDI_SUCCESS);
 153	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
 154	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
 155
 156	/* Notify specfs to invalidate the cached size */
 157	spec_size_invalidate(dev, VBLK);
 158	spec_size_invalidate(dev, VCHR);
 159}
 160
 161int
 162zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
 163{
 164	if (volsize == 0)
 165		return (EINVAL);
 166
 167	if (volsize % blocksize != 0)
 168		return (EINVAL);
 169
 170#ifdef _ILP32
 171	if (volsize - 1 > SPEC_MAXOFFSET_T)
 172		return (EOVERFLOW);
 173#endif
 174	return (0);
 175}
 176
 177int
 178zvol_check_volblocksize(uint64_t volblocksize)
 179{
 180	if (volblocksize < SPA_MINBLOCKSIZE ||
 181	    volblocksize > SPA_MAXBLOCKSIZE ||
 182	    !ISP2(volblocksize))
 183		return (EDOM);
 184
 185	return (0);
 186}
 187
 188int
 189zvol_get_stats(objset_t *os, nvlist_t *nv)
 190{
 191	int error;
 192	dmu_object_info_t doi;
 193	uint64_t val;
 194
 195	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
 196	if (error)
 197		return (error);
 198
 199	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
 200
 201	error = dmu_object_info(os, ZVOL_OBJ, &doi);
 202
 203	if (error == 0) {
 204		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
 205		    doi.doi_data_block_size);
 206	}
 207
 208	return (error);
 209}
 210
 211static zvol_state_t *
 212zvol_minor_lookup(const char *name)
 213{
 214	minor_t minor;
 215	zvol_state_t *zv;
 216
 217	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 218
 219	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
 220		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 221		if (zv == NULL)
 222			continue;
 223		if (strcmp(zv->zv_name, name) == 0)
 224			return (zv);
 225	}
 226
 227	return (NULL);
 228}
 229
 230/* extent mapping arg */
 231struct maparg {
 232	zvol_state_t	*ma_zv;
 233	uint64_t	ma_blks;
 234};
 235
 236/*ARGSUSED*/
 237static int
 238zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
 239    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 240{
 241	struct maparg *ma = arg;
 242	zvol_extent_t *ze;
 243	int bs = ma->ma_zv->zv_volblocksize;
 244
 245	if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
 246		return (0);
 247
 248	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
 249	ma->ma_blks++;
 250
 251	/* Abort immediately if we have encountered gang blocks */
 252	if (BP_IS_GANG(bp))
 253		return (EFRAGS);
 254
 255	/*
 256	 * See if the block is at the end of the previous extent.
 257	 */
 258	ze = list_tail(&ma->ma_zv->zv_extents);
 259	if (ze &&
 260	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
 261	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
 262	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
 263		ze->ze_nblks++;
 264		return (0);
 265	}
 266
 267	dprintf_bp(bp, "%s", "next blkptr:");
 268
 269	/* start a new extent */
 270	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
 271	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
 272	ze->ze_nblks = 1;
 273	list_insert_tail(&ma->ma_zv->zv_extents, ze);
 274	return (0);
 275}
 276
 277static void
 278zvol_free_extents(zvol_state_t *zv)
 279{
 280	zvol_extent_t *ze;
 281
 282	while (ze = list_head(&zv->zv_extents)) {
 283		list_remove(&zv->zv_extents, ze);
 284		kmem_free(ze, sizeof (zvol_extent_t));
 285	}
 286}
 287
 288static int
 289zvol_get_lbas(zvol_state_t *zv)
 290{
 291	objset_t *os = zv->zv_objset;
 292	struct maparg	ma;
 293	int		err;
 294
 295	ma.ma_zv = zv;
 296	ma.ma_blks = 0;
 297	zvol_free_extents(zv);
 298
 299	/* commit any in-flight changes before traversing the dataset */
 300	txg_wait_synced(dmu_objset_pool(os), 0);
 301	err = traverse_dataset(dmu_objset_ds(os), 0,
 302	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
 303	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
 304		zvol_free_extents(zv);
 305		return (err ? err : EIO);
 306	}
 307
 308	return (0);
 309}
 310
 311/* ARGSUSED */
 312void
 313zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 314{
 315	zfs_creat_t *zct = arg;
 316	nvlist_t *nvprops = zct->zct_props;
 317	int error;
 318	uint64_t volblocksize, volsize;
 319
 320	VERIFY(nvlist_lookup_uint64(nvprops,
 321	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
 322	if (nvlist_lookup_uint64(nvprops,
 323	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
 324		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 325
 326	/*
 327	 * These properties must be removed from the list so the generic
 328	 * property setting step won't apply to them.
 329	 */
 330	VERIFY(nvlist_remove_all(nvprops,
 331	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
 332	(void) nvlist_remove_all(nvprops,
 333	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
 334
 335	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
 336	    DMU_OT_NONE, 0, tx);
 337	ASSERT(error == 0);
 338
 339	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
 340	    DMU_OT_NONE, 0, tx);
 341	ASSERT(error == 0);
 342
 343	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
 344	ASSERT(error == 0);
 345}
 346
 347/*
 348 * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
 349 * implement DKIOCFREE/free-long-range.
 350 */
 351static int
 352zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
 353{
 354	uint64_t offset, length;
 355
 356	if (byteswap)
 357		byteswap_uint64_array(lr, sizeof (*lr));
 358
 359	offset = lr->lr_offset;
 360	length = lr->lr_length;
 361
 362	return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
 363}
 364
 365/*
 366 * Replay a TX_WRITE ZIL transaction that didn't get committed
 367 * after a system failure
 368 */
 369static int
 370zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
 371{
 372	objset_t *os = zv->zv_objset;
 373	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
 374	uint64_t offset, length;
 375	dmu_tx_t *tx;
 376	int error;
 377
 378	if (byteswap)
 379		byteswap_uint64_array(lr, sizeof (*lr));
 380
 381	offset = lr->lr_offset;
 382	length = lr->lr_length;
 383
 384	/* If it's a dmu_sync() block, write the whole block */
 385	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 386		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
 387		if (length < blocksize) {
 388			offset -= offset % blocksize;
 389			length = blocksize;
 390		}
 391	}
 392
 393	tx = dmu_tx_create(os);
 394	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
 395	error = dmu_tx_assign(tx, TXG_WAIT);
 396	if (error) {
 397		dmu_tx_abort(tx);
 398	} else {
 399		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
 400		dmu_tx_commit(tx);
 401	}
 402
 403	return (error);
 404}
 405
/*
 * Replay handler that does nothing and always returns ENOTSUP; it fills
 * every zvol_replay_vector slot other than TX_WRITE and TX_TRUNCATE.
 */
/* ARGSUSED */
static int
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
{
	return (ENOTSUP);
}
 412
/*
 * Callback vectors for replaying records.
 * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
 *
 * The initializer is positional: each entry's slot is its position in the
 * list, and the trailing comment names the record type expected there.
 * This table is what zvol_create_minor() passes to zil_replay().
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* 0 no such transaction type */
	zvol_replay_err,	/* TX_CREATE */
	zvol_replay_err,	/* TX_MKDIR */
	zvol_replay_err,	/* TX_MKXATTR */
	zvol_replay_err,	/* TX_SYMLINK */
	zvol_replay_err,	/* TX_REMOVE */
	zvol_replay_err,	/* TX_RMDIR */
	zvol_replay_err,	/* TX_LINK */
	zvol_replay_err,	/* TX_RENAME */
	zvol_replay_write,	/* TX_WRITE */
	zvol_replay_truncate,	/* TX_TRUNCATE */
	zvol_replay_err,	/* TX_SETATTR */
	zvol_replay_err,	/* TX_ACL */
	zvol_replay_err,	/* TX_CREATE_ACL */
	zvol_replay_err,	/* TX_CREATE_ATTR */
	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL */
	zvol_replay_err,	/* TX_MKDIR_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
	zvol_replay_err,	/* TX_WRITE2 */
};
 439
 440int
 441zvol_name2minor(const char *name, minor_t *minor)
 442{
 443	zvol_state_t *zv;
 444
 445	mutex_enter(&zfsdev_state_lock);
 446	zv = zvol_minor_lookup(name);
 447	if (minor && zv)
 448		*minor = zv->zv_minor;
 449	mutex_exit(&zfsdev_state_lock);
 450	return (zv ? 0 : -1);
 451}
 452
 453/*
 454 * Create a minor node (plus a whole lot more) for the specified volume.
 455 */
 456int
 457zvol_create_minor(const char *name)
 458{
 459	zfs_soft_state_t *zs;
 460	zvol_state_t *zv;
 461	objset_t *os;
 462	dmu_object_info_t doi;
 463	minor_t minor = 0;
 464	char chrbuf[30], blkbuf[30];
 465	int error;
 466
 467	mutex_enter(&zfsdev_state_lock);
 468
 469	if (zvol_minor_lookup(name) != NULL) {
 470		mutex_exit(&zfsdev_state_lock);
 471		return (EEXIST);
 472	}
 473
 474	/* lie and say we're read-only */
 475	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
 476
 477	if (error) {
 478		mutex_exit(&zfsdev_state_lock);
 479		return (error);
 480	}
 481
 482	if ((minor = zfsdev_minor_alloc()) == 0) {
 483		dmu_objset_disown(os, FTAG);
 484		mutex_exit(&zfsdev_state_lock);
 485		return (ENXIO);
 486	}
 487
 488	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
 489		dmu_objset_disown(os, FTAG);
 490		mutex_exit(&zfsdev_state_lock);
 491		return (EAGAIN);
 492	}
 493	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
 494	    (char *)name);
 495
 496	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
 497
 498	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
 499	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
 500		ddi_soft_state_free(zfsdev_state, minor);
 501		dmu_objset_disown(os, FTAG);
 502		mutex_exit(&zfsdev_state_lock);
 503		return (EAGAIN);
 504	}
 505
 506	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
 507
 508	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
 509	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
 510		ddi_remove_minor_node(zfs_dip, chrbuf);
 511		ddi_soft_state_free(zfsdev_state, minor);
 512		dmu_objset_disown(os, FTAG);
 513		mutex_exit(&zfsdev_state_lock);
 514		return (EAGAIN);
 515	}
 516
 517	zs = ddi_get_soft_state(zfsdev_state, minor);
 518	zs->zss_type = ZSST_ZVOL;
 519	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 520	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
 521	zv->zv_min_bs = DEV_BSHIFT;
 522	zv->zv_minor = minor;
 523	zv->zv_objset = os;
 524	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
 525		zv->zv_flags |= ZVOL_RDONLY;
 526	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 527	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
 528	    sizeof (rl_t), offsetof(rl_t, r_node));
 529	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
 530	    offsetof(zvol_extent_t, ze_node));
 531	/* get and cache the blocksize */
 532	error = dmu_object_info(os, ZVOL_OBJ, &doi);
 533	ASSERT(error == 0);
 534	zv->zv_volblocksize = doi.doi_data_block_size;
 535
 536	if (spa_writeable(dmu_objset_spa(os))) {
 537		if (zil_replay_disable)
 538			zil_destroy(dmu_objset_zil(os), B_FALSE);
 539		else
 540			zil_replay(os, zv, zvol_replay_vector);
 541	}
 542	dmu_objset_disown(os, FTAG);
 543	zv->zv_objset = NULL;
 544
 545	zvol_minors++;
 546
 547	mutex_exit(&zfsdev_state_lock);
 548
 549	return (0);
 550}
 551
 552/*
 553 * Remove minor node for the specified volume.
 554 */
 555static int
 556zvol_remove_zv(zvol_state_t *zv)
 557{
 558	char nmbuf[20];
 559	minor_t minor = zv->zv_minor;
 560
 561	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 562	if (zv->zv_total_opens != 0)
 563		return (EBUSY);
 564
 565	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
 566	ddi_remove_minor_node(zfs_dip, nmbuf);
 567
 568	(void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
 569	ddi_remove_minor_node(zfs_dip, nmbuf);
 570
 571	avl_destroy(&zv->zv_znode.z_range_avl);
 572	mutex_destroy(&zv->zv_znode.z_range_lock);
 573
 574	kmem_free(zv, sizeof (zvol_state_t));
 575
 576	ddi_soft_state_free(zfsdev_state, minor);
 577
 578	zvol_minors--;
 579	return (0);
 580}
 581
 582int
 583zvol_remove_minor(const char *name)
 584{
 585	zvol_state_t *zv;
 586	int rc;
 587
 588	mutex_enter(&zfsdev_state_lock);
 589	if ((zv = zvol_minor_lookup(name)) == NULL) {
 590		mutex_exit(&zfsdev_state_lock);
 591		return (ENXIO);
 592	}
 593	rc = zvol_remove_zv(zv);
 594	mutex_exit(&zfsdev_state_lock);
 595	return (rc);
 596}
 597
 598int
 599zvol_first_open(zvol_state_t *zv)
 600{
 601	objset_t *os;
 602	uint64_t volsize;
 603	int error;
 604	uint64_t readonly;
 605
 606	/* lie and say we're read-only */
 607	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
 608	    zvol_tag, &os);
 609	if (error)
 610		return (error);
 611
 612	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 613	if (error) {
 614		ASSERT(error == 0);
 615		dmu_objset_disown(os, zvol_tag);
 616		return (error);
 617	}
 618	zv->zv_objset = os;
 619	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
 620	if (error) {
 621		dmu_objset_disown(os, zvol_tag);
 622		return (error);
 623	}
 624	zv->zv_volsize = volsize;
 625	zv->zv_zilog = zil_open(os, zvol_get_data);
 626	zvol_size_changed(zv->zv_volsize, ddi_driver_major(zfs_dip),
 627	    zv->zv_minor);
 628
 629	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
 630	    NULL) == 0);
 631	if (readonly || dmu_objset_is_snapshot(os) ||
 632	    !spa_writeable(dmu_objset_spa(os)))
 633		zv->zv_flags |= ZVOL_RDONLY;
 634	else
 635		zv->zv_flags &= ~ZVOL_RDONLY;
 636	return (error);
 637}
 638
 639void
 640zvol_last_close(zvol_state_t *zv)
 641{
 642	zil_close(zv->zv_zilog);
 643	zv->zv_zilog = NULL;
 644	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
 645	zv->zv_dbuf = NULL;
 646	dmu_objset_disown(zv->zv_objset, zvol_tag);
 647	zv->zv_objset = NULL;
 648}
 649
 650int
 651zvol_prealloc(zvol_state_t *zv)
 652{
 653	objset_t *os = zv->zv_objset;
 654	dmu_tx_t *tx;
 655	uint64_t refd, avail, usedobjs, availobjs;
 656	uint64_t resid = zv->zv_volsize;
 657	uint64_t off = 0;
 658
 659	/* Check the space usage before attempting to allocate the space */
 660	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
 661	if (avail < zv->zv_volsize)
 662		return (ENOSPC);
 663
 664	/* Free old extents if they exist */
 665	zvol_free_extents(zv);
 666
 667	while (resid != 0) {
 668		int error;
 669		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
 670
 671		tx = dmu_tx_create(os);
 672		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
 673		error = dmu_tx_assign(tx, TXG_WAIT);
 674		if (error) {
 675			dmu_tx_abort(tx);
 676			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
 677			return (error);
 678		}
 679		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
 680		dmu_tx_commit(tx);
 681		off += bytes;
 682		resid -= bytes;
 683	}
 684	txg_wait_synced(dmu_objset_pool(os), 0);
 685
 686	return (0);
 687}
 688
 689int
 690zvol_update_volsize(objset_t *os, uint64_t volsize)
 691{
 692	dmu_tx_t *tx;
 693	int error;
 694
 695	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 696
 697	tx = dmu_tx_create(os);
 698	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 699	error = dmu_tx_assign(tx, TXG_WAIT);
 700	if (error) {
 701		dmu_tx_abort(tx);
 702		return (error);
 703	}
 704
 705	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
 706	    &volsize, tx);
 707	dmu_tx_commit(tx);
 708
 709	if (error == 0)
 710		error = dmu_free_long_range(os,
 711		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
 712	return (error);
 713}
 714
 715void
 716zvol_remove_minors(const char *name)
 717{
 718	zvol_state_t *zv;
 719	char *namebuf;
 720	minor_t minor;
 721
 722	namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
 723	(void) strncpy(namebuf, name, strlen(name));
 724	(void) strcat(namebuf, "/");
 725	mutex_enter(&zfsdev_state_lock);
 726	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
 727
 728		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 729		if (zv == NULL)
 730			continue;
 731		if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
 732			(void) zvol_remove_zv(zv);
 733	}
 734	kmem_free(namebuf, strlen(name) + 2);
 735
 736	mutex_exit(&zfsdev_state_lock);
 737}
 738
 739int
 740zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
 741{
 742	zvol_state_t *zv = NULL;
 743	objset_t *os;
 744	int error;
 745	dmu_object_info_t doi;
 746	uint64_t old_volsize = 0ULL;
 747	uint64_t readonly;
 748
 749	mutex_enter(&zfsdev_state_lock);
 750	zv = zvol_minor_lookup(name);
 751	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
 752		mutex_exit(&zfsdev_state_lock);
 753		return (error);
 754	}
 755
 756	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
 757	    (error = zvol_check_volsize(volsize,
 758	    doi.doi_data_block_size)) != 0)
 759		goto out;
 760
 761	VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
 762	    NULL) == 0);
 763	if (readonly) {
 764		error = EROFS;
 765		goto out;
 766	}
 767
 768	error = zvol_update_volsize(os, volsize);
 769	/*
 770	 * Reinitialize the dump area to the new size. If we
 771	 * failed to resize the dump area then restore it back to
 772	 * its original size.
 773	 */
 774	if (zv && error == 0) {
 775		if (zv->zv_flags & ZVOL_DUMPIFIED) {
 776			old_volsize = zv->zv_volsize;
 777			zv->zv_volsize = volsize;
 778			if ((error = zvol_dumpify(zv)) != 0 ||
 779			    (error = dumpvp_resize()) != 0) {
 780				(void) zvol_update_volsize(os, old_volsize);
 781				zv->zv_volsize = old_volsize;
 782				error = zvol_dumpify(zv);
 783			}
 784		}
 785		if (error == 0) {
 786			zv->zv_volsize = volsize;
 787			zvol_size_changed(volsize, maj, zv->zv_minor);
 788		}
 789	}
 790
 791	/*
 792	 * Generate a LUN expansion event.
 793	 */
 794	if (zv && error == 0) {
 795		sysevent_id_t eid;
 796		nvlist_t *attr;
 797		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 798
 799		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
 800		    zv->zv_minor);
 801
 802		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 803		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
 804
 805		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
 806		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
 807
 808		nvlist_free(attr);
 809		kmem_free(physpath, MAXPATHLEN);
 810	}
 811
 812out:
 813	dmu_objset_rele(os, FTAG);
 814
 815	mutex_exit(&zfsdev_state_lock);
 816
 817	return (error);
 818}
 819
 820/*ARGSUSED*/
 821int
 822zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
 823{
 824	zvol_state_t *zv;
 825	int err = 0;
 826
 827	mutex_enter(&zfsdev_state_lock);
 828
 829	zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
 830	if (zv == NULL) {
 831		mutex_exit(&zfsdev_state_lock);
 832		return (ENXIO);
 833	}
 834
 835	if (zv->zv_total_opens == 0)
 836		err = zvol_first_open(zv);
 837	if (err) {
 838		mutex_exit(&zfsdev_state_lock);
 839		return (err);
 840	}
 841	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
 842		err = EROFS;
 843		goto out;
 844	}
 845	if (zv->zv_flags & ZVOL_EXCL) {
 846		err = EBUSY;
 847		goto out;
 848	}
 849	if (flag & FEXCL) {
 850		if (zv->zv_total_opens != 0) {
 851			err = EBUSY;
 852			goto out;
 853		}
 854		zv->zv_flags |= ZVOL_EXCL;
 855	}
 856
 857	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
 858		zv->zv_open_count[otyp]++;
 859		zv->zv_total_opens++;
 860	}
 861	mutex_exit(&zfsdev_state_lock);
 862
 863	return (err);
 864out:
 865	if (zv->zv_total_opens == 0)
 866		zvol_last_close(zv);
 867	mutex_exit(&zfsdev_state_lock);
 868	return (err);
 869}
 870
 871/*ARGSUSED*/
 872int
 873zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
 874{
 875	minor_t minor = getminor(dev);
 876	zvol_state_t *zv;
 877	int error = 0;
 878
 879	mutex_enter(&zfsdev_state_lock);
 880
 881	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 882	if (zv == NULL) {
 883		mutex_exit(&zfsdev_state_lock);
 884		return (ENXIO);
 885	}
 886
 887	if (zv->zv_flags & ZVOL_EXCL) {
 888		ASSERT(zv->zv_total_opens == 1);
 889		zv->zv_flags &= ~ZVOL_EXCL;
 890	}
 891
 892	/*
 893	 * If the open count is zero, this is a spurious close.
 894	 * That indicates a bug in the kernel / DDI framework.
 895	 */
 896	ASSERT(zv->zv_open_count[otyp] != 0);
 897	ASSERT(zv->zv_total_opens != 0);
 898
 899	/*
 900	 * You may get multiple opens, but only one close.
 901	 */
 902	zv->zv_open_count[otyp]--;
 903	zv->zv_total_opens--;
 904
 905	if (zv->zv_total_opens == 0)
 906		zvol_last_close(zv);
 907
 908	mutex_exit(&zfsdev_state_lock);
 909	return (error);
 910}
 911
 912static void
 913zvol_get_done(zgd_t *zgd, int error)
 914{
 915	if (zgd->zgd_db)
 916		dmu_buf_rele(zgd->zgd_db, zgd);
 917
 918	zfs_range_unlock(zgd->zgd_rl);
 919
 920	if (error == 0 && zgd->zgd_bp)
 921		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
 922
 923	kmem_free(zgd, sizeof (zgd_t));
 924}
 925
 926/*
 927 * Get data to generate a TX_WRITE intent log record.
 928 */
 929static int
 930zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 931{
 932	zvol_state_t *zv = arg;
 933	objset_t *os = zv->zv_objset;
 934	uint64_t object = ZVOL_OBJ;
 935	uint64_t offset = lr->lr_offset;
 936	uint64_t size = lr->lr_length;	/* length of user data */
 937	blkptr_t *bp = &lr->lr_blkptr;
 938	dmu_buf_t *db;
 939	zgd_t *zgd;
 940	int error;
 941
 942	ASSERT(zio != NULL);
 943	ASSERT(size != 0);
 944
 945	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 946	zgd->zgd_zilog = zv->zv_zilog;
 947	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
 948
 949	/*
 950	 * Write records come in two flavors: immediate and indirect.
 951	 * For small writes it's cheaper to store the data with the
 952	 * log record (immediate); for large writes it's cheaper to
 953	 * sync the data and get a pointer to it (indirect) so that
 954	 * we don't have to write the data twice.
 955	 */
 956	if (buf != NULL) {	/* immediate write */
 957		error = dmu_read(os, object, offset, size, buf,
 958		    DMU_READ_NO_PREFETCH);
 959	} else {
 960		size = zv->zv_volblocksize;
 961		offset = P2ALIGN(offset, size);
 962		error = dmu_buf_hold(os, object, offset, zgd, &db,
 963		    DMU_READ_NO_PREFETCH);
 964		if (error == 0) {
 965			zgd->zgd_db = db;
 966			zgd->zgd_bp = bp;
 967
 968			ASSERT(db->db_offset == offset);
 969			ASSERT(db->db_size == size);
 970
 971			error = dmu_sync(zio, lr->lr_common.lrc_txg,
 972			    zvol_get_done, zgd);
 973
 974			if (error == 0)
 975				return (0);
 976		}
 977	}
 978
 979	zvol_get_done(zgd, error);
 980
 981	return (error);
 982}
 983
 984/*
 985 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
 986 *
 987 * We store data in the log buffers if it's small enough.
 988 * Otherwise we will later flush the data out via dmu_sync().
 989 */
 990ssize_t zvol_immediate_write_sz = 32768;
 991
 992static void
 993zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
 994    boolean_t sync)
 995{
 996	uint32_t blocksize = zv->zv_volblocksize;
 997	zilog_t *zilog = zv->zv_zilog;
 998	boolean_t slogging;
 999	ssize_t immediate_write_sz;
1000
1001	if (zil_replaying(zilog, tx))
1002		return;
1003
1004	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
1005	    ? 0 : zvol_immediate_write_sz;
1006
1007	slogging = spa_has_slogs(zilog->zl_spa) &&
1008	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
1009
1010	while (resid) {
1011		itx_t *itx;
1012		lr_write_t *lr;
1013		ssize_t len;
1014		itx_wr_state_t write_state;
1015
1016		/*
1017		 * Unlike zfs_log_write() we can be called with
1018		 * upto DMU_MAX_ACCESS/2 (5MB) writes.
1019		 */
1020		if (blocksize > immediate_write_sz && !slogging &&
1021		    resid >= blocksize && off % blocksize == 0) {
1022			write_state = WR_INDIRECT; /* uses dmu_sync */
1023			len = blocksize;
1024		} else if (sync) {
1025			write_state = WR_COPIED;
1026			len = MIN(ZIL_MAX_LOG_DATA, resid);
1027		} else {
1028			write_state = WR_NEED_COPY;
1029			len = MIN(ZIL_MAX_LOG_DATA, resid);
1030		}
1031
1032		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
1033		    (write_state == WR_COPIED ? len : 0));
1034		lr = (lr_write_t *)&itx->itx_lr;
1035		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
1036		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
1037			zil_itx_destroy(itx);
1038			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
1039			lr = (lr_write_t *)&itx->itx_lr;
1040			write_state = WR_NEED_COPY;
1041		}
1042
1043		itx->itx_wr_state = write_state;
1044		if (write_state == WR_NEED_COPY)
1045			itx->itx_sod += len;
1046		lr->lr_foid = ZVOL_OBJ;
1047		lr->lr_offset = off;
1048		lr->lr_length = len;
1049		lr->lr_blkoff = 0;
1050		BP_ZERO(&lr->lr_blkptr);
1051
1052		itx->itx_private = zv;
1053		itx->itx_sync = sync;
1054
1055		zil_itx_assign(zilog, itx, tx);
1056
1057		off += len;
1058		resid -= len;
1059	}
1060}
1061
1062static int
1063zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
1064    boolean_t doread, boolean_t isdump)
1065{
1066	vdev_disk_t *dvd;
1067	int c;
1068	int numerrors = 0;
1069
1070	for (c = 0; c < vd->vdev_children; c++) {
1071		ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
1072		    vd->vdev_ops == &vdev_replacing_ops ||
1073		    vd->vdev_ops == &vdev_spare_ops);
1074		int err = zvol_dumpio_vdev(vd->vdev_child[c],
1075		    addr, offset, size, doread, isdump);
1076		if (err != 0) {
1077			numerrors++;
1078		} else if (doread) {
1079			break;
1080		}
1081	}
1082
1083	if (!vd->vdev_ops->vdev_op_leaf)
1084		return (numerrors < vd->vdev_children ? 0 : EIO);
1085
1086	if (doread && !vdev_readable(vd))
1087		return (EIO);
1088	else if (!doread && !vdev_writeable(vd))
1089		return (EIO);
1090
1091	dvd = vd->vdev_tsd;
1092	ASSERT3P(dvd, !=, NULL);
1093	offset += VDEV_LABEL_START_SIZE;
1094
1095	if (ddi_in_panic() || isdump) {
1096		ASSERT(!doread);
1097		if (doread)
1098			return (EIO);
1099		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
1100		    lbtodb(size)));
1101	} else {
1102		return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
1103		    doread ? B_READ : B_WRITE));
1104	}
1105}
1106
1107static int
1108zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1109    boolean_t doread, boolean_t isdump)
1110{
1111	vdev_t *vd;
1112	int error;
1113	zvol_extent_t *ze;
1114	spa_t *spa = dmu_objset_spa(zv->zv_objset);
1115
1116	/* Must be sector aligned, and not stradle a block boundary. */
1117	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1118	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1119		return (EINVAL);
1120	}
1121	ASSERT(size <= zv->zv_volblocksize);
1122
1123	/* Locate the extent this belongs to */
1124	ze = list_head(&zv->zv_extents);
1125	while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
1126		offset -= ze->ze_nblks * zv->zv_volblocksize;
1127		ze = list_next(&zv->zv_extents, ze);
1128	}
1129
1130	if (!ddi_in_panic())
1131		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1132
1133	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
1134	offset += DVA_GET_OFFSET(&ze->ze_dva);
1135	error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
1136
1137	if (!ddi_in_panic())
1138		spa_config_exit(spa, SCL_STATE, FTAG);
1139
1140	return (error);
1141}
1142
1143int
1144zvol_strategy(buf_t *bp)
1145{
1146	zfs_soft_state_t *zs = NULL;
1147	zvol_state_t *zv;
1148	uint64_t off, volsize;
1149	size_t resid;
1150	char *addr;
1151	objset_t *os;
1152	rl_t *rl;
1153	int error = 0;
1154	boolean_t doread = bp->b_flags & B_READ;
1155	boolean_t is_dump;
1156	boolean_t sync;
1157
1158	if (getminor(bp->b_edev) == 0) {
1159		error = EINVAL;
1160	} else {
1161		zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
1162		if (zs == NULL)
1163			error = ENXIO;
1164		else if (zs->zss_type != ZSST_ZVOL)
1165			error = EINVAL;
1166	}
1167
1168	if (error) {
1169		bioerror(bp, error);
1170		biodone(bp);
1171		return (0);
1172	}
1173
1174	zv = zs->zss_data;
1175
1176	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
1177		bioerror(bp, EROFS);
1178		biodone(bp);
1179		return (0);
1180	}
1181
1182	off = ldbtob(bp->b_blkno);
1183	volsize = zv->zv_volsize;
1184
1185	os = zv->zv_objset;
1186	ASSERT(os != NULL);
1187
1188	bp_mapin(bp);
1189	addr = bp->b_un.b_addr;
1190	resid = bp->b_bcount;
1191
1192	if (resid > 0 && (off < 0 || off >= volsize)) {
1193		bioerror(bp, EIO);
1194		biodone(bp);
1195		return (0);
1196	}
1197
1198	is_dump = zv->zv_flags & ZVOL_DUMPIFIED;
1199	sync = ((!(bp->b_flags & B_ASYNC) &&
1200	    !(zv->zv_flags & ZVOL_WCE)) ||
1201	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
1202	    !doread && !is_dump;
1203
1204	/*
1205	 * There must be no buffer changes when doing a dmu_sync() because
1206	 * we can't change the data whilst calculating the checksum.
1207	 */
1208	rl = zfs_range_lock(&zv->zv_znode, off, resid,
1209	    doread ? RL_READER : RL_WRITER);
1210
1211	while (resid != 0 && off < volsize) {
1212		size_t size = MIN(resid, zvol_maxphys);
1213		if (is_dump) {
1214			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
1215			error = zvol_dumpio(zv, addr, off, size,
1216			    doread, B_FALSE);
1217		} else if (doread) {
1218			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
1219			    DMU_READ_PREFETCH);
1220		} else {
1221			dmu_tx_t *tx = dmu_tx_create(os);
1222			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
1223			error = dmu_tx_assign(tx, TXG_WAIT);
1224			if (error) {
1225				dmu_tx_abort(tx);
1226			} else {
1227				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
1228				zvol_log_write(zv, tx, off, size, sync);
1229				dmu_tx_commit(tx);
1230			}
1231		}
1232		if (error) {
1233			/* convert checksum errors into IO errors */
1234			if (error == ECKSUM)
1235				error = EIO;
1236			break;
1237		}
1238		off += size;
1239		addr += size;
1240		resid -= size;
1241	}
1242	zfs_range_unlock(rl);
1243
1244	if ((bp->b_resid = resid) == bp->b_bcount)
1245		bioerror(bp, off > volsize ? EINVAL : error);
1246
1247	if (sync)
1248		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1249	biodone(bp);
1250
1251	return (0);
1252}
1253
1254/*
1255 * Set the buffer count to the zvol maximum transfer.
1256 * Using our own routine instead of the default minphys()
1257 * means that for larger writes we write bigger buffers on X86
1258 * (128K instead of 56K) and flush the disk write cache less often
1259 * (every zvol_maxphys - currently 1MB) instead of minphys (currently
1260 * 56K on X86 and 128K on sparc).
1261 */
1262void
1263zvol_minphys(struct buf *bp)
1264{
1265	if (bp->b_bcount > zvol_maxphys)
1266		bp->b_bcount = zvol_maxphys;
1267}
1268
1269int
1270zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
1271{
1272	minor_t minor = getminor(dev);
1273	zvol_state_t *zv;
1274	int error = 0;
1275	uint64_t size;
1276	uint64_t boff;
1277	uint64_t resid;
1278
1279	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1280	if (zv == NULL)
1281		return (ENXIO);
1282
1283	boff = ldbtob(blkno);
1284	resid = ldbtob(nblocks);
1285
1286	VERIFY3U(boff + resid, <=, zv->zv_volsize);
1287
1288	while (resid) {
1289		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
1290		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
1291		if (error)
1292			break;
1293		boff += size;
1294		addr += size;
1295		resid -= size;
1296	}
1297
1298	return (error);
1299}
1300
1301/*ARGSUSED*/
1302int
1303zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
1304{
1305	minor_t minor = getminor(dev);
1306	zvol_state_t *zv;
1307	uint64_t volsize;
1308	rl_t *rl;
1309	int error = 0;
1310
1311	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1312	if (zv == NULL)
1313		return (ENXIO);
1314
1315	volsize = zv->zv_volsize;
1316	if (uio->uio_resid > 0 &&
1317	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1318		return (EIO);
1319
1320	if (zv->zv_flags & ZVOL_DUMPIFIED) {
1321		error = physio(zvol_strategy, NULL, dev, B_READ,
1322		    zvol_minphys, uio);
1323		return (error);
1324	}
1325
1326	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1327	    RL_READER);
1328	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1329		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1330
1331		/* don't read past the end */
1332		if (bytes > volsize - uio->uio_loffset)
1333			bytes = volsize - uio->uio_loffset;
1334
1335		error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
1336		if (error) {
1337			/* convert checksum errors into IO errors */
1338			if (error == ECKSUM)
1339				error = EIO;
1340			break;
1341		}
1342	}
1343	zfs_range_unlock(rl);
1344	return (error);
1345}
1346
1347/*ARGSUSED*/
1348int
1349zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
1350{
1351	minor_t minor = getminor(dev);
1352	zvol_state_t *zv;
1353	uint64_t volsize;
1354	rl_t *rl;
1355	int error = 0;
1356	boolean_t sync;
1357
1358	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1359	if (zv == NULL)
1360		return (ENXIO);
1361
1362	volsize = zv->zv_volsize;
1363	if (uio->uio_resid > 0 &&
1364	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1365		return (EIO);
1366
1367	if (zv->zv_flags & ZVOL_DUMPIFIED) {
1368		error = physio(zvol_strategy, NULL, dev, B_WRITE,
1369		    zvol_minphys, uio);
1370		return (error);
1371	}
1372
1373	sync = !(zv->zv_flags & ZVOL_WCE) ||
1374	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1375
1376	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1377	    RL_WRITER);
1378	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1379		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1380		uint64_t off = uio->uio_loffset;
1381		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1382
1383		if (bytes > volsize - off)	/* don't write past the end */
1384			bytes = volsize - off;
1385
1386		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
1387		error = dmu_tx_assign(tx, TXG_WAIT);
1388		if (error) {
1389			dmu_tx_abort(tx);
1390			break;
1391		}
1392		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
1393		if (error == 0)
1394			zvol_log_write(zv, tx, off, bytes, sync);
1395		dmu_tx_commit(tx);
1396
1397		if (error)
1398			break;
1399	}
1400	zfs_range_unlock(rl);
1401	if (sync)
1402		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1403	return (error);
1404}
1405
1406int
1407zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
1408{
1409	struct uuid uuid = EFI_RESERVED;
1410	efi_gpe_t gpe = { 0 };
1411	uint32_t crc;
1412	dk_efi_t efi;
1413	int length;
1414	char *ptr;
1415
1416	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
1417		return (EFAULT);
1418	ptr = (char *)(uintptr_t)efi.dki_data_64;
1419	length = efi.dki_length;
1420	/*
1421	 * Some clients may attempt to request a PMBR for the
1422	 * zvol.  Currently this interface will return EINVAL to
1423	 * such requests.  These requests could be supported by
1424	 * adding a check for lba == 0 and consing up an appropriate
1425	 * PMBR.
1426	 */
1427	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
1428		return (EINVAL);
1429
1430	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
1431	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
1432	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
1433
1434	if (efi.dki_lba == 1) {
1435		efi_gpt_t gpt = { 0 };
1436
1437		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
1438		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
1439		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
1440		gpt.efi_gpt_MyLBA = LE_64(1ULL);
1441		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
1442		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
1443		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
1444		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
1445		gpt.efi_gpt_SizeOfPartitionEntry =
1446		    LE_32(sizeof (efi_gpe_t));
1447		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
1448		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
1449		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
1450		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
1451		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
1452		    flag))
1453			return (EFAULT);
1454		ptr += sizeof (gpt);
1455		length -= sizeof (gpt);
1456	}
1457	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
1458	    length), flag))
1459		return (EFAULT);
1460	return (0);
1461}
1462
1463/*
1464 * BEGIN entry points to allow external callers access to the volume.
1465 */
1466/*
1467 * Return the volume parameters needed for access from an external caller.
1468 * These values are invariant as long as the volume is held open.
1469 */
1470int
1471zvol_get_volume_params(minor_t minor, uint64_t *blksize,
1472    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
1473    void **rl_hdl, void **bonus_hdl)
1474{
1475	zvol_state_t *zv;
1476
1477	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1478	if (zv == NULL)
1479		return (ENXIO);
1480	if (zv->zv_flags & ZVOL_DUMPIFIED)
1481		return (ENXIO);
1482
1483	ASSERT(blksize && max_xfer_len && minor_hdl &&
1484	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
1485
1486	*blksize = zv->zv_volblocksize;
1487	*max_xfer_len = (uint64_t)zvol_maxphys;
1488	*minor_hdl = zv;
1489	*objset_hdl = zv->zv_objset;
1490	*zil_hdl = zv->zv_zilog;
1491	*rl_hdl = &zv->zv_znode;
1492	*bonus_hdl = zv->zv_dbuf;
1493	return (0);
1494}
1495
1496/*
1497 * Return the current volume size to an external caller.
1498 * The size can change while the volume is open.
1499 */
1500uint64_t
1501zvol_get_volume_size(void *minor_hdl)
1502{
1503	zvol_state_t *zv = minor_hdl;
1504
1505	return (zv->zv_volsize);
1506}
1507
1508/*
1509 * Return the current WCE setting to an external caller.
1510 * The WCE setting can change while the volume is open.
1511 */
1512int
1513zvol_get_volume_wce(void *minor_hdl)
1514{
1515	zvol_state_t *zv = minor_hdl;
1516
1517	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
1518}
1519
1520/*
1521 * Entry point for external callers to zvol_log_write
1522 */
1523void
1524zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
1525    boolean_t sync)
1526{
1527	zvol_state_t *zv = minor_hdl;
1528
1529	zvol_log_write(zv, tx, off, resid, sync);
1530}
1531/*
1532 * END entry points to allow external callers access to the volume.
1533 */
1534
1535/*
1536 * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
1537 */
1538static void
1539zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
1540    boolean_t sync)
1541{
1542	itx_t *itx;
1543	lr_truncate_t *lr;
1544	zilog_t *zilog = zv->zv_zilog;
1545
1546	if (zil_replaying(zilog, tx))
1547		return;
1548
1549	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
1550	lr = (lr_truncate_t *)&itx->itx_lr;
1551	lr->lr_foid = ZVOL_OBJ;
1552	lr->lr_offset = off;
1553	lr->lr_length = len;
1554
1555	itx->itx_sync = sync;
1556	zil_itx_assign(zilog, itx, tx);
1557}
1558
1559/*
1560 * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
1561 * Also a dirtbag dkio ioctl for unmap/free-block functionality.
1562 */
1563/*ARGSUSED*/
1564int
1565zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
1566{
1567	zvol_state_t *zv;
1568	struct dk_cinfo dki;
1569	struct dk_minfo dkm;
1570	struct dk_callback *dkc;
1571	int error = 0;
1572	rl_t *rl;
1573
1574	mutex_enter(&zfsdev_state_lock);
1575
1576	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
1577
1578	if (zv == NULL) {
1579		mutex_exit(&zfsdev_state_lock);
1580		return (ENXIO);
1581	}
1582	ASSERT(zv->zv_total_opens > 0);
1583
1584	switch (cmd) {
1585
1586	case DKIOCINFO:
1587		bzero(&dki, sizeof (dki));
1588		(void) strcpy(dki.dki_cname, "zvol");
1589		(void) strcpy(dki.dki_dname, "zvol");
1590		dki.dki_ctype = DKC_UNKNOWN;
1591		dki.dki_unit = getminor(dev);
1592		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
1593		mutex_exit(&zfsdev_state_lock);
1594		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
1595			error = EFAULT;
1596		return (error);
1597
1598	case DKIOCGMEDIAINFO:
1599		bzero(&dkm, sizeof (dkm));
1600		dkm.dki_lbsize = 1U << zv->zv_min_bs;
1601		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1602		dkm.dki_media_type = DK_UNKNOWN;
1603		mutex_exit(&zfsdev_state_lock);
1604		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
1605			error = EFAULT;
1606		return (error);
1607
1608	case DKIOCGETEFI:
1609		{
1610			uint64_t vs = zv->zv_volsize;
1611			uint8_t bs = zv->zv_min_bs;
1612
1613			mutex_exit(&zfsdev_state_lock);
1614			error = zvol_getefi((void *)arg, flag, vs, bs);
1615			return (error);
1616		}
1617
1618	case DKIOCFLUSHWRITECACHE:
1619		dkc = (struct dk_callback *)arg;
1620		mutex_exit(&zfsdev_state_lock);
1621		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1622		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
1623			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
1624			error = 0;
1625		}
1626		return (error);
1627
1628	case DKIOCGETWCE:
1629		{
1630			int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
1631			if (ddi_copyout(&wce, (void *)arg, sizeof (int),
1632			    flag))
1633				error = EFAULT;
1634			break;
1635		}
1636	case DKIOCSETWCE:
1637		{
1638			int wce;
1639			if (ddi_copyin((void *)arg, &wce, sizeof (int),
1640			    flag)) {
1641				error = EFAULT;
1642				break;
1643			}
1644			if (wce) {
1645				zv->zv_flags |= ZVOL_WCE;
1646				mutex_exit(&zfsdev_state_lock);
1647			} else {
1648				zv->zv_flags &= ~ZVOL_WCE;
1649				mutex_exit(&zfsdev_state_lock);
1650				zil_commit(zv->zv_zilog, ZVOL_OBJ);
1651			}
1652			return (0);
1653		}
1654
1655	case DKIOCGGEOM:
1656	case DKIOCGVTOC:
1657		/*
1658		 * commands using these (like prtvtoc) expect ENOTSUP
1659		 * since we're emulating an EFI label
1660		 */
1661		error = ENOTSUP;
1662		break;
1663
1664	case DKIOCDUMPINIT:
1665		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1666		    RL_WRITER);
1667		error = zvol_dumpify(zv);
1668		zfs_range_unlock(rl);
1669		break;
1670
1671	case DKIOCDUMPFINI:
1672		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1673			break;
1674		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1675		    RL_WRITER);
1676		error = zvol_dump_fini(zv);
1677		zfs_range_unlock(rl);
1678		break;
1679
1680	case DKIOCFREE:
1681	{
1682		dkioc_free_t df;
1683		dmu_tx_t *tx;
1684
1685		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
1686			error = EFAULT;
1687			break;
1688		}
1689
1690		/*
1691		 * Apply Postel's Law to length-checking.  If they overshoot,
1692		 * just blank out until the end, if there's a need to blank
1693		 * out anything.
1694		 */
1695		if (df.df_start >= zv->zv_volsize)
1696			break;	/* No need to do anything... */
1697		if (df.df_start + df.df_length > zv->zv_volsize)
1698			df.df_length = DMU_OBJECT_END;
1699
1700		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
1701		    RL_WRITER);
1702		tx = dmu_tx_create(zv->zv_objset);
1703		error = dmu_tx_assign(tx, TXG_WAIT);
1704		if (error != 0) {
1705			dmu_tx_abort(tx);
1706		} else {
1707			zvol_log_truncate(zv, tx, df.df_start,
1708			    df.df_length, B_TRUE);
1709			dmu_tx_commit(tx);
1710			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1711			    df.df_start, df.df_length);
1712		}
1713
1714		zfs_range_unlock(rl);
1715
1716		if (error == 0) {
1717			/*
1718			 * If the write-cache is disabled or 'sync' property
1719			 * is set to 'always' then treat this as a synchronous
1720			 * operation (i.e. commit to zil).
1721			 */
1722			if (!(zv->zv_flags & ZVOL_WCE) ||
1723			    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
1724				zil_commit(zv->zv_zilog, ZVOL_OBJ);
1725
1726			/*
1727			 * If the caller really wants synchronous writes, and
1728			 * can't wait for them, don't return until the write
1729			 * is done.
1730			 */
1731			if (df.df_flags & DF_WAIT_SYNC) {
1732				txg_wait_synced(
1733				    dmu_objset_pool(zv->zv_objset), 0);
1734			}
1735		}
1736		break;
1737	}
1738
1739	default:
1740		error = ENOTTY;
1741		break;
1742
1743	}
1744	mutex_exit(&zfsdev_state_lock);
1745	return (error);
1746}
1747
1748int
1749zvol_busy(void)
1750{
1751	return (zvol_minors != 0);
1752}
1753
1754void
1755zvol_init(void)
1756{
1757	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
1758	    1) == 0);
1759	mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
1760}
1761
1762void
1763zvol_fini(void)
1764{
1765	mutex_destroy(&zfsdev_state_lock);
1766	ddi_soft_state_fini(&zfsdev_state);
1767}
1768
1769static int
1770zvol_dump_init(zvol_state_t *zv, boolean_t resize)
1771{
1772	dmu_tx_t *tx;
1773	int error = 0;
1774	objset_t *os = zv->zv_objset;
1775	nvlist_t *nv = NULL;
1776	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
1777
1778	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
1779	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
1780	    DMU_OBJECT_END);
1781	/* wait for dmu_free_long_range to actually free the blocks */
1782	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
1783
1784	tx = dmu_tx_create(os);
1785	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
1786	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
1787	error = dmu_tx_assign(tx, TXG_WAIT);
1788	if (error) {
1789		dmu_tx_abort(tx);
1790		return (error);
1791	}
1792
1793	/*
1794	 * If we are resizing the dump device then we only need to
1795	 * update the refreservation to match the newly updated
1796	 * zvolsize. Otherwise, we save off the original state of the
1797	 * zvol so that we can restore them if the zvol is ever undumpified.
1798	 */
1799	if (resize) {
1800		error = zap_update(os, ZVOL_ZAP_OBJ,
1801		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
1802		    &zv->zv_volsize, tx);
1803	} else {
1804		uint64_t checksum, compress, refresrv, vbs, dedup;
1805
1806		error = dsl_prop_get_integer(zv->zv_name,
1807		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
1808		error = error ? error : dsl_prop_get_integer(zv->zv_name,
1809		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
1810		error = error ? error : dsl_prop_get_integer(zv->zv_name,
1811		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
1812		error = error ? error : dsl_prop_get_integer(zv->zv_name,
1813		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
1814		if (version >= SPA_VERSION_DEDUP) {
1815			error = error ? error :
1816			    dsl_prop_get_integer(zv->zv_name,
1817			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
1818		}
1819
1820		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1821		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
1822		    &compress, tx);
1823		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1824		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
1825		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1826		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
1827		    &refresrv, tx);
1828		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1829		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
1830		    &vbs, tx);
1831		error = error ? error : dmu_object_set_blocksize(
1832		    os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
1833		if (version >= SPA_VERSION_DEDUP) {
1834			error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1835			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
1836			    &dedup, tx);
1837		}
1838		if (error == 0)
1839			zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
1840	}
1841	dmu_tx_commit(tx);
1842
1843	/*
1844	 * We only need update the zvol's property if we are initializing
1845	 * the dump area for the first time.
1846	 */
1847	if (!resize) {
1848		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1849		VERIFY(nvlist_add_uint64(nv,
1850		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
1851		VERIFY(nvlist_add_uint64(nv,
1852		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
1853		    ZIO_COMPRESS_OFF) == 0);
1854		VERIFY(nvlist_add_uint64(nv,
1855		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
1856		    ZIO_CHECKSUM_OFF) == 0);
1857		if (version >= SPA_VERSION_DEDUP) {
1858			VERIFY(nvlist_add_uint64(nv,
1859			    zfs_prop_to_name(ZFS_PROP_DEDUP),
1860			    ZIO_CHECKSUM_OFF) == 0);
1861		}
1862
1863		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
1864		    nv, NULL);
1865		nvlist_free(nv);
1866
1867		if (error)
1868			return (error);
1869	}
1870
1871	/* Allocate the space for the dump */
1872	error = zvol_prealloc(zv);
1873	return (error);
1874}
1875
1876static int
1877zvol_dumpify(zvol_state_t *zv)
1878{
1879	int error = 0;
1880	uint64_t dumpsize = 0;
1881	dmu_tx_t *tx;
1882	objset_t *os = zv->zv_objset;
1883
1884	if (zv->zv_flags & ZVOL_RDONLY)
1885		return (EROFS);
1886
1887	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
1888	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
1889		boolean_t resize = (dumpsize > 0);
1890
1891		if ((error = zvol_dump_init(zv, resize)) != 0) {
1892			(void) zvol_dump_fini(zv);
1893			return (error);
1894		}
1895	}
1896
1897	/*
1898	 * Build up our lba mapping.
1899	 */
1900	error = zvol_get_lbas(zv);
1901	if (error) {
1902		(void) zvol_dump_fini(zv);
1903		return (error);
1904	}
1905
1906	tx = dmu_tx_create(os);
1907	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
1908	error = dmu_tx_assign(tx, TXG_WAIT);
1909	if (error) {
1910		dmu_tx_abort(tx);
1911		(void) zvol_dump_fini(zv);
1912		return (error);
1913	}
1914
1915	zv->zv_flags |= ZVOL_DUMPIFIED;
1916	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
1917	    &zv->zv_volsize, tx);
1918	dmu_tx_commit(tx);
1919
1920	if (error) {
1921		(void) zvol_dump_fini(zv);
1922		return (error);
1923	}
1924
1925	txg_wait_synced(dmu_objset_pool(os), 0);
1926	return (0);
1927}
1928
1929static int
1930zvol_dump_fini(zvol_state_t *zv)
1931{
1932	dmu_tx_t *tx;
1933	objset_t *os = zv->zv_objset;
1934	nvlist_t *nv;
1935	int error = 0;
1936	uint64_t checksum, compress, refresrv, vbs, dedup;
1937	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
1938
1939	/*
1940	 * Attempt to restore the zvol back to its pre-dumpified state.
1941	 * This is a best-effort attempt as it's possible that not all
1942	 * of these properties were initialized during the dumpify process
1943	 * (i.e. error during zvol_dump_init).
1944	 */
1945
1946	tx = dmu_tx_create(os);
1947	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
1948	error = dmu_tx_assign(tx, TXG_WAIT);
1949	if (error) {
1950		dmu_tx_abort(tx);
1951		return (error);
1952	}
1953	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
1954	dmu_tx_commit(tx);
1955
1956	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
1957	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
1958	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
1959	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
1960	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
1961	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
1962	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
1963	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
1964
1965	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1966	(void) nvlist_add_uint64(nv,
1967	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
1968	(void) nvlist_add_uint64(nv,
1969	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
1970	(void) nvlist_add_uint64(nv,
1971	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
1972	if (version >= SPA_VERSION_DEDUP &&
1973	    zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
1974	    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
1975		(void) nvlist_add_uint64(nv,
1976		    zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
1977	}
1978	(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
1979	    nv, NULL);
1980	nvlist_free(nv);
1981
1982	zvol_free_extents(zv);
1983	zv->zv_flags &= ~ZVOL_DUMPIFIED;
1984	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
1985	/* wait for dmu_free_long_range to actually free the blocks */
1986	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
1987	tx = dmu_tx_create(os);
1988	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
1989	error = dmu_tx_assign(tx, TXG_WAIT);
1990	if (error) {
1991		dmu_tx_abort(tx);
1992		return (error);
1993	}
1994	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
1995		zv->zv_volblocksize = vbs;
1996	dmu_tx_commit(tx);
1997
1998	return (0);
1999}