
/usr/src/uts/common/fs/zfs/zfs_vfsops.c

https://bitbucket.org/0xffea/illumos-dccp
C | 2317 lines | 1513 code | 329 blank | 475 comment | 398 complexity | MD5 98755ed360f2f2c3ab0cc974ddf7e243

Large files are truncated; only the beginning of the file is shown below.

   1/*
   2 * CDDL HEADER START
   3 *
   4 * The contents of this file are subject to the terms of the
   5 * Common Development and Distribution License (the "License").
   6 * You may not use this file except in compliance with the License.
   7 *
   8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9 * or http://www.opensolaris.org/os/licensing.
  10 * See the License for the specific language governing permissions
  11 * and limitations under the License.
  12 *
  13 * When distributing Covered Code, include this CDDL HEADER in each
  14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15 * If applicable, add the following below this CDDL HEADER, with the
  16 * fields enclosed by brackets "[]" replaced with your own identifying
  17 * information: Portions Copyright [yyyy] [name of copyright owner]
  18 *
  19 * CDDL HEADER END
  20 */
  21/*
  22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23 * Copyright (c) 2012 by Delphix. All rights reserved.
  24 */
  25
  26/* Portions Copyright 2010 Robert Milkowski */
  27
  28#include <sys/types.h>
  29#include <sys/param.h>
  30#include <sys/systm.h>
  31#include <sys/sysmacros.h>
  32#include <sys/kmem.h>
  33#include <sys/pathname.h>
  34#include <sys/vnode.h>
  35#include <sys/vfs.h>
  36#include <sys/vfs_opreg.h>
  37#include <sys/mntent.h>
  38#include <sys/mount.h>
  39#include <sys/cmn_err.h>
  40#include "fs/fs_subr.h"
  41#include <sys/zfs_znode.h>
  42#include <sys/zfs_dir.h>
  43#include <sys/zil.h>
  44#include <sys/fs/zfs.h>
  45#include <sys/dmu.h>
  46#include <sys/dsl_prop.h>
  47#include <sys/dsl_dataset.h>
  48#include <sys/dsl_deleg.h>
  49#include <sys/spa.h>
  50#include <sys/zap.h>
  51#include <sys/sa.h>
  52#include <sys/varargs.h>
  53#include <sys/policy.h>
  54#include <sys/atomic.h>
  55#include <sys/mkdev.h>
  56#include <sys/modctl.h>
  57#include <sys/refstr.h>
  58#include <sys/zfs_ioctl.h>
  59#include <sys/zfs_ctldir.h>
  60#include <sys/zfs_fuid.h>
  61#include <sys/bootconf.h>
  62#include <sys/sunddi.h>
  63#include <sys/dnlc.h>
  64#include <sys/dmu_objset.h>
  65#include <sys/spa_boot.h>
  66#include <sys/sa.h>
  67#include "zfs_comutil.h"
  68
  69int zfsfstype;
  70vfsops_t *zfs_vfsops = NULL;
  71static major_t zfs_major;
  72static minor_t zfs_minor;
  73static kmutex_t	zfs_dev_mtx;
  74
  75extern int sys_shutdown;
  76
  77static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
  78static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
  79static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
  80static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
  81static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
  82static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
  83static void zfs_freevfs(vfs_t *vfsp);
  84
  85static const fs_operation_def_t zfs_vfsops_template[] = {
  86	VFSNAME_MOUNT,		{ .vfs_mount = zfs_mount },
  87	VFSNAME_MOUNTROOT,	{ .vfs_mountroot = zfs_mountroot },
  88	VFSNAME_UNMOUNT,	{ .vfs_unmount = zfs_umount },
  89	VFSNAME_ROOT,		{ .vfs_root = zfs_root },
  90	VFSNAME_STATVFS,	{ .vfs_statvfs = zfs_statvfs },
  91	VFSNAME_SYNC,		{ .vfs_sync = zfs_sync },
  92	VFSNAME_VGET,		{ .vfs_vget = zfs_vget },
  93	VFSNAME_FREEVFS,	{ .vfs_freevfs = zfs_freevfs },
  94	NULL,			NULL
  95};
  96
  97static const fs_operation_def_t zfs_vfsops_eio_template[] = {
  98	VFSNAME_FREEVFS,	{ .vfs_freevfs =  zfs_freevfs },
  99	NULL,			NULL
 100};
 101
 102/*
 103 * We need to keep a count of active fs's.
 104 * This is necessary to prevent our module
 105 * from being unloaded after a umount -f
 106 */
 107static uint32_t	zfs_active_fs_count = 0;
 108
 109static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
 110static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
 111static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
 112static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
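/*
 * Each cancel list names the option that the given mount option
 * overrides, so "atime"/"noatime" and "xattr"/"noxattr" are mutually
 * exclusive pairs; the property callbacks below keep the two views in
 * step with vfs_clearmntopt()/vfs_setmntopt().
 */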
 113
 114/*
 115 * MO_DEFAULT is not used since the default value is determined
 116 * by the equivalent property.
 117 */
 118static mntopt_t mntopts[] = {
 119	{ MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
 120	{ MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
 121	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
 122	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
 123};
 124
 125static mntopts_t zfs_mntopts = {
 126	sizeof (mntopts) / sizeof (mntopt_t),
 127	mntopts
 128};
 129
 130/*ARGSUSED*/
 131int
 132zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
 133{
 134	/*
 135	 * Data integrity is job one.  We don't want a compromised kernel
 136	 * writing to the storage pool, so we never sync during panic.
 137	 */
 138	if (panicstr)
 139		return (0);
 140
 141	/*
 142	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
 143	 * to sync metadata, which they would otherwise cache indefinitely.
 144	 * Semantically, the only requirement is that the sync be initiated.
 145	 * The DMU syncs out txgs frequently, so there's nothing to do.
 146	 */
 147	if (flag & SYNC_ATTR)
 148		return (0);
 149
 150	if (vfsp != NULL) {
 151		/*
 152		 * Sync a specific filesystem.
 153		 */
 154		zfsvfs_t *zfsvfs = vfsp->vfs_data;
 155		dsl_pool_t *dp;
 156
 157		ZFS_ENTER(zfsvfs);
 158		dp = dmu_objset_pool(zfsvfs->z_os);
 159
 160		/*
 161		 * If the system is shutting down, then skip any
 162		 * filesystems which may exist on a suspended pool.
 163		 */
 164		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
 165			ZFS_EXIT(zfsvfs);
 166			return (0);
 167		}
 168
 169		if (zfsvfs->z_log != NULL)
 170			zil_commit(zfsvfs->z_log, 0);
 171
 172		ZFS_EXIT(zfsvfs);
 173	} else {
 174		/*
 175		 * Sync all ZFS filesystems.  This is what happens when you
 176		 * run sync(1M).  Unlike other filesystems, ZFS honors the
 177		 * request by waiting for all pools to commit all dirty data.
 178		 */
 179		spa_sync_allpools();
 180	}
 181
 182	return (0);
 183}
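/*
 * Note that in addition to being the VFS sync entry point registered
 * above, zfs_sync() is called directly from zfs_mountroot() below when
 * the root file system is torn down at shutdown (ROOT_UNMOUNT).
 */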
 184
 185static int
 186zfs_create_unique_device(dev_t *dev)
 187{
 188	major_t new_major;
 189
 190	do {
 191		ASSERT3U(zfs_minor, <=, MAXMIN32);
 192		minor_t start = zfs_minor;
 193		do {
 194			mutex_enter(&zfs_dev_mtx);
 195			if (zfs_minor >= MAXMIN32) {
 196				/*
 197				 * If we're still using the real major
 198				 * keep out of /dev/zfs and /dev/zvol minor
 199				 * number space.  If we're using a getudev()'ed
 200				 * major number, we can use all of its minors.
 201				 */
 202				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
 203					zfs_minor = ZFS_MIN_MINOR;
 204				else
 205					zfs_minor = 0;
 206			} else {
 207				zfs_minor++;
 208			}
 209			*dev = makedevice(zfs_major, zfs_minor);
 210			mutex_exit(&zfs_dev_mtx);
 211		} while (vfs_devismounted(*dev) && zfs_minor != start);
 212		if (zfs_minor == start) {
 213			/*
 214			 * We are using all ~262,000 minor numbers for the
 215			 * current major number.  Create a new major number.
 216			 */
 217			if ((new_major = getudev()) == (major_t)-1) {
 218				cmn_err(CE_WARN,
 219				    "zfs_mount: Can't get unique major "
 220				    "device number.");
 221				return (-1);
 222			}
 223			mutex_enter(&zfs_dev_mtx);
 224			zfs_major = new_major;
 225			zfs_minor = 0;
 226
 227			mutex_exit(&zfs_dev_mtx);
 228		} else {
 229			break;
 230		}
 231		/* CONSTANTCONDITION */
 232	} while (1);
 233
 234	return (0);
 235}
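/*
 * The dev_t produced here becomes vfsp->vfs_dev in zfs_domount().
 * ZFS mounts are not backed by a single device node, so a synthetic
 * device number is minted instead; it is what zfs_statvfs() later
 * reports as f_fsid.
 */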
 236
 237static void
 238atime_changed_cb(void *arg, uint64_t newval)
 239{
 240	zfsvfs_t *zfsvfs = arg;
 241
 242	if (newval == TRUE) {
 243		zfsvfs->z_atime = TRUE;
 244		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 245		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 246	} else {
 247		zfsvfs->z_atime = FALSE;
 248		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 249		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 250	}
 251}
 252
 253static void
 254xattr_changed_cb(void *arg, uint64_t newval)
 255{
 256	zfsvfs_t *zfsvfs = arg;
 257
 258	if (newval == TRUE) {
 259		/* XXX locking on vfs_flag? */
 260		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
 261		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 262		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 263	} else {
 264		/* XXX locking on vfs_flag? */
 265		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
 266		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 267		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 268	}
 269}
 270
 271static void
 272blksz_changed_cb(void *arg, uint64_t newval)
 273{
 274	zfsvfs_t *zfsvfs = arg;
 275
 276	if (newval < SPA_MINBLOCKSIZE ||
 277	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
 278		newval = SPA_MAXBLOCKSIZE;
 279
 280	zfsvfs->z_max_blksz = newval;
 281	zfsvfs->z_vfs->vfs_bsize = newval;
 282}
 283
 284static void
 285readonly_changed_cb(void *arg, uint64_t newval)
 286{
 287	zfsvfs_t *zfsvfs = arg;
 288
 289	if (newval) {
 290		/* XXX locking on vfs_flag? */
 291		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 292		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 293		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 294	} else {
 295		/* XXX locking on vfs_flag? */
 296		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 297		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 298		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 299	}
 300}
 301
 302static void
 303devices_changed_cb(void *arg, uint64_t newval)
 304{
 305	zfsvfs_t *zfsvfs = arg;
 306
 307	if (newval == FALSE) {
 308		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
 309		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
 310		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
 311	} else {
 312		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
 313		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
 314		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
 315	}
 316}
 317
 318static void
 319setuid_changed_cb(void *arg, uint64_t newval)
 320{
 321	zfsvfs_t *zfsvfs = arg;
 322
 323	if (newval == FALSE) {
 324		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 325		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 326		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 327	} else {
 328		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 329		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 330		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 331	}
 332}
 333
 334static void
 335exec_changed_cb(void *arg, uint64_t newval)
 336{
 337	zfsvfs_t *zfsvfs = arg;
 338
 339	if (newval == FALSE) {
 340		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 341		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 342		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 343	} else {
 344		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 345		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 346		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 347	}
 348}
 349
 350/*
 351 * The nbmand mount option can be changed at mount time.
  352 * We can't allow it to be toggled on live file systems, or incorrect
  353 * behavior may be seen from CIFS clients.
  354 *
  355 * This property isn't registered via dsl_prop_register(), but this callback
  356 * will be called when a file system is first mounted.
 357 */
 358static void
 359nbmand_changed_cb(void *arg, uint64_t newval)
 360{
 361	zfsvfs_t *zfsvfs = arg;
 362	if (newval == FALSE) {
 363		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
 364		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
 365	} else {
 366		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
 367		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
 368	}
 369}
 370
 371static void
 372snapdir_changed_cb(void *arg, uint64_t newval)
 373{
 374	zfsvfs_t *zfsvfs = arg;
 375
 376	zfsvfs->z_show_ctldir = newval;
 377}
 378
 379static void
 380vscan_changed_cb(void *arg, uint64_t newval)
 381{
 382	zfsvfs_t *zfsvfs = arg;
 383
 384	zfsvfs->z_vscan = newval;
 385}
 386
 387static void
 388acl_mode_changed_cb(void *arg, uint64_t newval)
 389{
 390	zfsvfs_t *zfsvfs = arg;
 391
 392	zfsvfs->z_acl_mode = newval;
 393}
 394
 395static void
 396acl_inherit_changed_cb(void *arg, uint64_t newval)
 397{
 398	zfsvfs_t *zfsvfs = arg;
 399
 400	zfsvfs->z_acl_inherit = newval;
 401}
 402
 403static int
 404zfs_register_callbacks(vfs_t *vfsp)
 405{
 406	struct dsl_dataset *ds = NULL;
 407	objset_t *os = NULL;
 408	zfsvfs_t *zfsvfs = NULL;
 409	uint64_t nbmand;
 410	int readonly, do_readonly = B_FALSE;
 411	int setuid, do_setuid = B_FALSE;
 412	int exec, do_exec = B_FALSE;
 413	int devices, do_devices = B_FALSE;
 414	int xattr, do_xattr = B_FALSE;
 415	int atime, do_atime = B_FALSE;
 416	int error = 0;
 417
 418	ASSERT(vfsp);
 419	zfsvfs = vfsp->vfs_data;
 420	ASSERT(zfsvfs);
 421	os = zfsvfs->z_os;
 422
 423	/*
 424	 * The act of registering our callbacks will destroy any mount
 425	 * options we may have.  In order to enable temporary overrides
 426	 * of mount options, we stash away the current values and
 427	 * restore them after we register the callbacks.
 428	 */
 429	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
 430	    !spa_writeable(dmu_objset_spa(os))) {
 431		readonly = B_TRUE;
 432		do_readonly = B_TRUE;
 433	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 434		readonly = B_FALSE;
 435		do_readonly = B_TRUE;
 436	}
 437	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 438		devices = B_FALSE;
 439		setuid = B_FALSE;
 440		do_devices = B_TRUE;
 441		do_setuid = B_TRUE;
 442	} else {
 443		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
 444			devices = B_FALSE;
 445			do_devices = B_TRUE;
 446		} else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
 447			devices = B_TRUE;
 448			do_devices = B_TRUE;
 449		}
 450
 451		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 452			setuid = B_FALSE;
 453			do_setuid = B_TRUE;
 454		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 455			setuid = B_TRUE;
 456			do_setuid = B_TRUE;
 457		}
 458	}
 459	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 460		exec = B_FALSE;
 461		do_exec = B_TRUE;
 462	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 463		exec = B_TRUE;
 464		do_exec = B_TRUE;
 465	}
 466	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 467		xattr = B_FALSE;
 468		do_xattr = B_TRUE;
 469	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 470		xattr = B_TRUE;
 471		do_xattr = B_TRUE;
 472	}
 473	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 474		atime = B_FALSE;
 475		do_atime = B_TRUE;
 476	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 477		atime = B_TRUE;
 478		do_atime = B_TRUE;
 479	}
 480
 481	/*
 482	 * nbmand is a special property.  It can only be changed at
 483	 * mount time.
 484	 *
 485	 * This is weird, but it is documented to only be changeable
 486	 * at mount time.
 487	 */
 488	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 489		nbmand = B_FALSE;
 490	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 491		nbmand = B_TRUE;
 492	} else {
 493		char osname[MAXNAMELEN];
 494
 495		dmu_objset_name(os, osname);
 496		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
 497		    NULL)) {
 498			return (error);
 499		}
 500	}
 501
 502	/*
 503	 * Register property callbacks.
 504	 *
 505	 * It would probably be fine to just check for i/o error from
 506	 * the first prop_register(), but I guess I like to go
 507	 * overboard...
 508	 */
 509	ds = dmu_objset_ds(os);
 510	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
 511	error = error ? error : dsl_prop_register(ds,
 512	    "xattr", xattr_changed_cb, zfsvfs);
 513	error = error ? error : dsl_prop_register(ds,
 514	    "recordsize", blksz_changed_cb, zfsvfs);
 515	error = error ? error : dsl_prop_register(ds,
 516	    "readonly", readonly_changed_cb, zfsvfs);
 517	error = error ? error : dsl_prop_register(ds,
 518	    "devices", devices_changed_cb, zfsvfs);
 519	error = error ? error : dsl_prop_register(ds,
 520	    "setuid", setuid_changed_cb, zfsvfs);
 521	error = error ? error : dsl_prop_register(ds,
 522	    "exec", exec_changed_cb, zfsvfs);
 523	error = error ? error : dsl_prop_register(ds,
 524	    "snapdir", snapdir_changed_cb, zfsvfs);
 525	error = error ? error : dsl_prop_register(ds,
 526	    "aclmode", acl_mode_changed_cb, zfsvfs);
 527	error = error ? error : dsl_prop_register(ds,
 528	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
 529	error = error ? error : dsl_prop_register(ds,
 530	    "vscan", vscan_changed_cb, zfsvfs);
 531	if (error)
 532		goto unregister;
 533
 534	/*
 535	 * Invoke our callbacks to restore temporary mount options.
 536	 */
 537	if (do_readonly)
 538		readonly_changed_cb(zfsvfs, readonly);
 539	if (do_setuid)
 540		setuid_changed_cb(zfsvfs, setuid);
 541	if (do_exec)
 542		exec_changed_cb(zfsvfs, exec);
 543	if (do_devices)
 544		devices_changed_cb(zfsvfs, devices);
 545	if (do_xattr)
 546		xattr_changed_cb(zfsvfs, xattr);
 547	if (do_atime)
 548		atime_changed_cb(zfsvfs, atime);
 549
 550	nbmand_changed_cb(zfsvfs, nbmand);
 551
 552	return (0);
 553
 554unregister:
 555	/*
 556	 * We may attempt to unregister some callbacks that are not
 557	 * registered, but this is OK; it will simply return ENOMSG,
 558	 * which we will ignore.
 559	 */
 560	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
 561	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
 562	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
 563	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
 564	(void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
 565	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
 566	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
 567	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
 568	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
 569	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
 570	    zfsvfs);
 571	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
 572	return (error);
 573
 574}
 575
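/*
 * Given the bonus buffer of an object, report which user and group
 * should be charged for its space.  Returning ENOENT means the object
 * type is not tracked for user/group accounting at all; returning
 * EEXIST (NULL data) tells the DMU to keep the ids it already has.
 */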
 576static int
 577zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
 578    uint64_t *userp, uint64_t *groupp)
 579{
 580	znode_phys_t *znp = data;
 581	int error = 0;
 582
 583	/*
 584	 * Is it a valid type of object to track?
 585	 */
 586	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
 587		return (ENOENT);
 588
 589	/*
  590	 * If we have a NULL data pointer,
  591	 * then assume the IDs aren't changing and
  592	 * return EEXIST to the DMU to let it know to
  593	 * use the same IDs.
 594	 */
 595	if (data == NULL)
 596		return (EEXIST);
 597
 598	if (bonustype == DMU_OT_ZNODE) {
 599		*userp = znp->zp_uid;
 600		*groupp = znp->zp_gid;
 601	} else {
 602		int hdrsize;
 603
 604		ASSERT(bonustype == DMU_OT_SA);
 605		hdrsize = sa_hdrsize(data);
 606
 607		if (hdrsize != 0) {
 608			*userp = *((uint64_t *)((uintptr_t)data + hdrsize +
 609			    SA_UID_OFFSET));
 610			*groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
 611			    SA_GID_OFFSET));
 612		} else {
 613			/*
 614			 * This should only happen for newly created
 615			 * files that haven't had the znode data filled
 616			 * in yet.
 617			 */
 618			*userp = 0;
 619			*groupp = 0;
 620		}
 621	}
 622	return (error);
 623}
 624
 625static void
 626fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
 627    char *domainbuf, int buflen, uid_t *ridp)
 628{
 629	uint64_t fuid;
 630	const char *domain;
 631
 632	fuid = strtonum(fuidstr, NULL);
 633
 634	domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
 635	if (domain)
 636		(void) strlcpy(domainbuf, domain, buflen);
 637	else
 638		domainbuf[0] = '\0';
 639	*ridp = FUID_RID(fuid);
 640}
 641
 642static uint64_t
 643zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
 644{
 645	switch (type) {
 646	case ZFS_PROP_USERUSED:
 647		return (DMU_USERUSED_OBJECT);
 648	case ZFS_PROP_GROUPUSED:
 649		return (DMU_GROUPUSED_OBJECT);
 650	case ZFS_PROP_USERQUOTA:
 651		return (zfsvfs->z_userquota_obj);
 652	case ZFS_PROP_GROUPQUOTA:
 653		return (zfsvfs->z_groupquota_obj);
 654	}
 655	return (0);
 656}
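/*
 * Each object returned above is a ZAP whose keys are the hex string
 * form of a fuid (see id_to_fuidstr() below) and whose values are
 * single 8-byte integers: bytes charged for the *used objects, byte
 * limits for the *quota objects.
 */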
 657
 658int
 659zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 660    uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
 661{
 662	int error;
 663	zap_cursor_t zc;
 664	zap_attribute_t za;
 665	zfs_useracct_t *buf = vbuf;
 666	uint64_t obj;
 667
 668	if (!dmu_objset_userspace_present(zfsvfs->z_os))
 669		return (ENOTSUP);
 670
 671	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 672	if (obj == 0) {
 673		*bufsizep = 0;
 674		return (0);
 675	}
 676
 677	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
 678	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 679	    zap_cursor_advance(&zc)) {
 680		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
 681		    *bufsizep)
 682			break;
 683
 684		fuidstr_to_sid(zfsvfs, za.za_name,
 685		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
 686
 687		buf->zu_space = za.za_first_integer;
 688		buf++;
 689	}
 690	if (error == ENOENT)
 691		error = 0;
 692
 693	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
 694	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
 695	*cookiep = zap_cursor_serialize(&zc);
 696	zap_cursor_fini(&zc);
 697	return (error);
 698}
 699
 700/*
  701 * buf must be big enough (e.g., 32 bytes)
 702 */
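/*
 * A 64-bit fuid rendered with "%llx" is at most 16 hex digits plus
 * the terminating NUL, so the 32-byte buffers used by the callers
 * leave plenty of headroom.
 */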
 703static int
 704id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
 705    char *buf, boolean_t addok)
 706{
 707	uint64_t fuid;
 708	int domainid = 0;
 709
 710	if (domain && domain[0]) {
 711		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
 712		if (domainid == -1)
 713			return (ENOENT);
 714	}
 715	fuid = FUID_ENCODE(domainid, rid);
 716	(void) sprintf(buf, "%llx", (longlong_t)fuid);
 717	return (0);
 718}
 719
 720int
 721zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 722    const char *domain, uint64_t rid, uint64_t *valp)
 723{
 724	char buf[32];
 725	int err;
 726	uint64_t obj;
 727
 728	*valp = 0;
 729
 730	if (!dmu_objset_userspace_present(zfsvfs->z_os))
 731		return (ENOTSUP);
 732
 733	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 734	if (obj == 0)
 735		return (0);
 736
 737	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
 738	if (err)
 739		return (err);
 740
 741	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
 742	if (err == ENOENT)
 743		err = 0;
 744	return (err);
 745}
 746
 747int
 748zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 749    const char *domain, uint64_t rid, uint64_t quota)
 750{
 751	char buf[32];
 752	int err;
 753	dmu_tx_t *tx;
 754	uint64_t *objp;
 755	boolean_t fuid_dirtied;
 756
 757	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
 758		return (EINVAL);
 759
 760	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
 761		return (ENOTSUP);
 762
 763	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
 764	    &zfsvfs->z_groupquota_obj;
 765
 766	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
 767	if (err)
 768		return (err);
 769	fuid_dirtied = zfsvfs->z_fuid_dirty;
 770
 771	tx = dmu_tx_create(zfsvfs->z_os);
 772	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
 773	if (*objp == 0) {
 774		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 775		    zfs_userquota_prop_prefixes[type]);
 776	}
 777	if (fuid_dirtied)
 778		zfs_fuid_txhold(zfsvfs, tx);
 779	err = dmu_tx_assign(tx, TXG_WAIT);
 780	if (err) {
 781		dmu_tx_abort(tx);
 782		return (err);
 783	}
 784
 785	mutex_enter(&zfsvfs->z_lock);
 786	if (*objp == 0) {
 787		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
 788		    DMU_OT_NONE, 0, tx);
 789		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 790		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
 791	}
 792	mutex_exit(&zfsvfs->z_lock);
 793
 794	if (quota == 0) {
 795		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
 796		if (err == ENOENT)
 797			err = 0;
 798	} else {
 799		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
 800	}
 801	ASSERT(err == 0);
 802	if (fuid_dirtied)
 803		zfs_fuid_sync(zfsvfs, tx);
 804	dmu_tx_commit(tx);
 805	return (err);
 806}
 807
 808boolean_t
 809zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 810{
 811	char buf[32];
 812	uint64_t used, quota, usedobj, quotaobj;
 813	int err;
 814
 815	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
 816	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 817
 818	if (quotaobj == 0 || zfsvfs->z_replay)
 819		return (B_FALSE);
 820
 821	(void) sprintf(buf, "%llx", (longlong_t)fuid);
 822	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
 823	if (err != 0)
 824		return (B_FALSE);
 825
 826	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
 827	if (err != 0)
 828		return (B_FALSE);
 829	return (used >= quota);
 830}
 831
 832boolean_t
 833zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
 834{
 835	uint64_t fuid;
 836	uint64_t quotaobj;
 837
 838	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 839
 840	fuid = isgroup ? zp->z_gid : zp->z_uid;
 841
 842	if (quotaobj == 0 || zfsvfs->z_replay)
 843		return (B_FALSE);
 844
 845	return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
 846}
 847
 848int
 849zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 850{
 851	objset_t *os;
 852	zfsvfs_t *zfsvfs;
 853	uint64_t zval;
 854	int i, error;
 855	uint64_t sa_obj;
 856
 857	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 858
 859	/*
 860	 * We claim to always be readonly so we can open snapshots;
 861	 * other ZPL code will prevent us from writing to snapshots.
 862	 */
 863	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
 864	if (error) {
 865		kmem_free(zfsvfs, sizeof (zfsvfs_t));
 866		return (error);
 867	}
 868
 869	/*
 870	 * Initialize the zfs-specific filesystem structure.
 871	 * Should probably make this a kmem cache, shuffle fields,
 872	 * and just bzero up to z_hold_mtx[].
 873	 */
 874	zfsvfs->z_vfs = NULL;
 875	zfsvfs->z_parent = zfsvfs;
 876	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
 877	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 878	zfsvfs->z_os = os;
 879
 880	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 881	if (error) {
 882		goto out;
 883	} else if (zfsvfs->z_version >
 884	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
  885		(void) printf("Can't mount a version %lld file system "
  886		    "on a version %lld pool.  Pool must be upgraded to mount "
  887		    "this file system.\n", (u_longlong_t)zfsvfs->z_version,
  888		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
 889		error = ENOTSUP;
 890		goto out;
 891	}
 892	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
 893		goto out;
 894	zfsvfs->z_norm = (int)zval;
 895
 896	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
 897		goto out;
 898	zfsvfs->z_utf8 = (zval != 0);
 899
 900	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
 901		goto out;
 902	zfsvfs->z_case = (uint_t)zval;
 903
 904	/*
 905	 * Fold case on file systems that are always or sometimes case
 906	 * insensitive.
 907	 */
 908	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 909	    zfsvfs->z_case == ZFS_CASE_MIXED)
 910		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 911
 912	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 913	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 914
 915	if (zfsvfs->z_use_sa) {
 916		/* should either have both of these objects or none */
 917		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 918		    &sa_obj);
  919		if (error)
  920			goto out;
 921	} else {
 922		/*
  923		 * Pre-SA version file systems should never touch
 924		 * either the attribute registration or layout objects.
 925		 */
 926		sa_obj = 0;
 927	}
 928
 929	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 930	    &zfsvfs->z_attr_table);
 931	if (error)
 932		goto out;
 933
 934	if (zfsvfs->z_version >= ZPL_VERSION_SA)
 935		sa_register_update_callback(os, zfs_sa_upgrade);
 936
 937	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 938	    &zfsvfs->z_root);
 939	if (error)
 940		goto out;
 941	ASSERT(zfsvfs->z_root != 0);
 942
 943	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 944	    &zfsvfs->z_unlinkedobj);
 945	if (error)
 946		goto out;
 947
 948	error = zap_lookup(os, MASTER_NODE_OBJ,
 949	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 950	    8, 1, &zfsvfs->z_userquota_obj);
 951	if (error && error != ENOENT)
 952		goto out;
 953
 954	error = zap_lookup(os, MASTER_NODE_OBJ,
 955	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 956	    8, 1, &zfsvfs->z_groupquota_obj);
 957	if (error && error != ENOENT)
 958		goto out;
 959
 960	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 961	    &zfsvfs->z_fuid_obj);
 962	if (error && error != ENOENT)
 963		goto out;
 964
 965	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 966	    &zfsvfs->z_shares_dir);
 967	if (error && error != ENOENT)
 968		goto out;
 969
 970	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 971	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 972	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 973	    offsetof(znode_t, z_link_node));
 974	rrw_init(&zfsvfs->z_teardown_lock);
 975	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 976	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 977	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 978		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 979
 980	*zfvp = zfsvfs;
 981	return (0);
 982
 983out:
 984	dmu_objset_disown(os, zfsvfs);
 985	*zfvp = NULL;
 986	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 987	return (error);
 988}
 989
 990static int
 991zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 992{
 993	int error;
 994
 995	error = zfs_register_callbacks(zfsvfs->z_vfs);
 996	if (error)
 997		return (error);
 998
 999	/*
1000	 * Set the objset user_ptr to track its zfsvfs.
1001	 */
1002	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1003	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1004	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1005
1006	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1007
1008	/*
 1009	 * If we are not mounting (i.e., online recv), then we don't
1010	 * have to worry about replaying the log as we blocked all
1011	 * operations out since we closed the ZIL.
1012	 */
1013	if (mounting) {
1014		boolean_t readonly;
1015
1016		/*
1017		 * During replay we remove the read only flag to
1018		 * allow replays to succeed.
1019		 */
1020		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1021		if (readonly != 0)
1022			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1023		else
1024			zfs_unlinked_drain(zfsvfs);
1025
1026		/*
1027		 * Parse and replay the intent log.
1028		 *
1029		 * Because of ziltest, this must be done after
1030		 * zfs_unlinked_drain().  (Further note: ziltest
1031		 * doesn't use readonly mounts, where
1032		 * zfs_unlinked_drain() isn't called.)  This is because
1033		 * ziltest causes spa_sync() to think it's committed,
1034		 * but actually it is not, so the intent log contains
1035		 * many txg's worth of changes.
1036		 *
1037		 * In particular, if object N is in the unlinked set in
1038		 * the last txg to actually sync, then it could be
1039		 * actually freed in a later txg and then reallocated
1040		 * in a yet later txg.  This would write a "create
1041		 * object N" record to the intent log.  Normally, this
1042		 * would be fine because the spa_sync() would have
1043		 * written out the fact that object N is free, before
1044		 * we could write the "create object N" intent log
1045		 * record.
1046		 *
1047		 * But when we are in ziltest mode, we advance the "open
1048		 * txg" without actually spa_sync()-ing the changes to
1049		 * disk.  So we would see that object N is still
1050		 * allocated and in the unlinked set, and there is an
1051		 * intent log record saying to allocate it.
1052		 */
1053		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1054			if (zil_replay_disable) {
1055				zil_destroy(zfsvfs->z_log, B_FALSE);
1056			} else {
1057				zfsvfs->z_replay = B_TRUE;
1058				zil_replay(zfsvfs->z_os, zfsvfs,
1059				    zfs_replay_vector);
1060				zfsvfs->z_replay = B_FALSE;
1061			}
1062		}
1063		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
1064	}
1065
1066	return (0);
1067}
1068
1069void
1070zfsvfs_free(zfsvfs_t *zfsvfs)
1071{
1072	int i;
1073	extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1074
1075	/*
1076	 * This is a barrier to prevent the filesystem from going away in
1077	 * zfs_znode_move() until we can safely ensure that the filesystem is
1078	 * not unmounted. We consider the filesystem valid before the barrier
1079	 * and invalid after the barrier.
1080	 */
1081	rw_enter(&zfsvfs_lock, RW_READER);
1082	rw_exit(&zfsvfs_lock);
1083
1084	zfs_fuid_destroy(zfsvfs);
1085
1086	mutex_destroy(&zfsvfs->z_znodes_lock);
1087	mutex_destroy(&zfsvfs->z_lock);
1088	list_destroy(&zfsvfs->z_all_znodes);
1089	rrw_destroy(&zfsvfs->z_teardown_lock);
1090	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1091	rw_destroy(&zfsvfs->z_fuid_lock);
1092	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1093		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1094	kmem_free(zfsvfs, sizeof (zfsvfs_t));
1095}
1096
1097static void
1098zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1099{
1100	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1101	if (zfsvfs->z_vfs) {
1102		if (zfsvfs->z_use_fuids) {
1103			vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1104			vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1105			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1106			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1107			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1108			vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1109		} else {
1110			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1111			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1112			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1113			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1114			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1115			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1116		}
1117	}
1118	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1119}
1120
1121static int
1122zfs_domount(vfs_t *vfsp, char *osname)
1123{
1124	dev_t mount_dev;
1125	uint64_t recordsize, fsid_guid;
1126	int error = 0;
1127	zfsvfs_t *zfsvfs;
1128
1129	ASSERT(vfsp);
1130	ASSERT(osname);
1131
1132	error = zfsvfs_create(osname, &zfsvfs);
1133	if (error)
1134		return (error);
1135	zfsvfs->z_vfs = vfsp;
1136
1137	/* Initialize the generic filesystem structure. */
1138	vfsp->vfs_bcount = 0;
1139	vfsp->vfs_data = NULL;
1140
1141	if (zfs_create_unique_device(&mount_dev) == -1) {
1142		error = ENODEV;
1143		goto out;
1144	}
1145	ASSERT(vfs_devismounted(mount_dev) == 0);
1146
1147	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1148	    NULL))
1149		goto out;
1150
1151	vfsp->vfs_dev = mount_dev;
1152	vfsp->vfs_fstype = zfsfstype;
1153	vfsp->vfs_bsize = recordsize;
1154	vfsp->vfs_flag |= VFS_NOTRUNC;
1155	vfsp->vfs_data = zfsvfs;
1156
1157	/*
1158	 * The fsid is 64 bits, composed of an 8-bit fs type, which
1159	 * separates our fsid from any other filesystem types, and a
1160	 * 56-bit objset unique ID.  The objset unique ID is unique to
1161	 * all objsets open on this system, provided by unique_create().
1162	 * The 8-bit fs type must be put in the low bits of fsid[1]
1163	 * because that's where other Solaris filesystems put it.
1164	 */
1165	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1166	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1167	vfsp->vfs_fsid.val[0] = fsid_guid;
1168	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
1169	    zfsfstype & 0xFF;
1170
1171	/*
1172	 * Set features for file system.
1173	 */
1174	zfs_set_fuid_feature(zfsvfs);
1175	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
1176		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1177		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1178		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
1179	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
1180		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1181		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1182	}
1183	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
1184
1185	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1186		uint64_t pval;
1187
1188		atime_changed_cb(zfsvfs, B_FALSE);
1189		readonly_changed_cb(zfsvfs, B_TRUE);
1190		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
1191			goto out;
1192		xattr_changed_cb(zfsvfs, pval);
1193		zfsvfs->z_issnap = B_TRUE;
1194		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1195
1196		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1197		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1198		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1199	} else {
1200		error = zfsvfs_setup(zfsvfs, B_TRUE);
1201	}
1202
1203	if (!zfsvfs->z_issnap)
1204		zfsctl_create(zfsvfs);
1205out:
1206	if (error) {
1207		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
1208		zfsvfs_free(zfsvfs);
1209	} else {
1210		atomic_add_32(&zfs_active_fs_count, 1);
1211	}
1212
1213	return (error);
1214}
1215
1216void
1217zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1218{
1219	objset_t *os = zfsvfs->z_os;
1220	struct dsl_dataset *ds;
1221
1222	/*
1223	 * Unregister properties.
1224	 */
1225	if (!dmu_objset_is_snapshot(os)) {
1226		ds = dmu_objset_ds(os);
1227		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
1228		    zfsvfs) == 0);
1229
1230		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
1231		    zfsvfs) == 0);
1232
1233		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
1234		    zfsvfs) == 0);
1235
1236		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
1237		    zfsvfs) == 0);
1238
1239		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
1240		    zfsvfs) == 0);
1241
1242		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
1243		    zfsvfs) == 0);
1244
1245		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
1246		    zfsvfs) == 0);
1247
1248		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
1249		    zfsvfs) == 0);
1250
1251		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
1252		    zfsvfs) == 0);
1253
1254		VERIFY(dsl_prop_unregister(ds, "aclinherit",
1255		    acl_inherit_changed_cb, zfsvfs) == 0);
1256
1257		VERIFY(dsl_prop_unregister(ds, "vscan",
1258		    vscan_changed_cb, zfsvfs) == 0);
1259	}
1260}
1261
1262/*
1263 * Convert a decimal digit string to a uint64_t integer.
1264 */
1265static int
1266str_to_uint64(char *str, uint64_t *objnum)
1267{
1268	uint64_t num = 0;
1269
1270	while (*str) {
1271		if (*str < '0' || *str > '9')
1272			return (EINVAL);
1273
1274		num = num*10 + *str++ - '0';
1275	}
1276
1277	*objnum = num;
1278	return (0);
1279}
1280
1281/*
1282 * The boot path passed from the boot loader is in the form of
 1283 * "rootpool-name/root-filesystem-object-number". Convert this
1284 * string to a dataset name: "rootpool-name/root-filesystem-name".
1285 */
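/*
 * For example (dataset names purely illustrative): given a bpath of
 * "rpool/85", dsl_dsobj_to_dsname() resolves object number 85 within
 * "rpool" and outpath might come back as something like
 * "rpool/ROOT/bootenv".
 */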
1286static int
1287zfs_parse_bootfs(char *bpath, char *outpath)
1288{
1289	char *slashp;
1290	uint64_t objnum;
1291	int error;
1292
1293	if (*bpath == 0 || *bpath == '/')
1294		return (EINVAL);
1295
1296	(void) strcpy(outpath, bpath);
1297
1298	slashp = strchr(bpath, '/');
1299
1300	/* if no '/', just return the pool name */
1301	if (slashp == NULL) {
1302		return (0);
1303	}
1304
1305	/* if not a number, just return the root dataset name */
1306	if (str_to_uint64(slashp+1, &objnum)) {
1307		return (0);
1308	}
1309
1310	*slashp = '\0';
1311	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
1312	*slashp = '/';
1313
1314	return (error);
1315}
1316
1317/*
1318 * zfs_check_global_label:
1319 *	Check that the hex label string is appropriate for the dataset
1320 *	being mounted into the global_zone proper.
1321 *
1322 *	Return an error if the hex label string is not default or
1323 *	admin_low/admin_high.  For admin_low labels, the corresponding
1324 *	dataset must be readonly.
1325 */
1326int
1327zfs_check_global_label(const char *dsname, const char *hexsl)
1328{
1329	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1330		return (0);
1331	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
1332		return (0);
1333	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
1334		/* must be readonly */
1335		uint64_t rdonly;
1336
1337		if (dsl_prop_get_integer(dsname,
1338		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
1339			return (EACCES);
1340		return (rdonly ? 0 : EACCES);
1341	}
1342	return (EACCES);
1343}
1344
1345/*
1346 * zfs_mount_label_policy:
 1347 *	Determine whether the mount is allowed according to MAC check,
 1348 *	by comparing (where appropriate) the label of the dataset against
 1349 *	the label of the zone being mounted into.  If the dataset has
1350 *	no label, create one.
1351 *
1352 *	Returns:
1353 *		 0 :	access allowed
1354 *		>0 :	error code, such as EACCES
1355 */
1356static int
1357zfs_mount_label_policy(vfs_t *vfsp, char *osname)
1358{
1359	int		error, retv;
1360	zone_t		*mntzone = NULL;
1361	ts_label_t	*mnt_tsl;
1362	bslabel_t	*mnt_sl;
1363	bslabel_t	ds_sl;
1364	char		ds_hexsl[MAXNAMELEN];
1365
1366	retv = EACCES;				/* assume the worst */
1367
1368	/*
1369	 * Start by getting the dataset label if it exists.
1370	 */
1371	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1372	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
1373	if (error)
1374		return (EACCES);
1375
1376	/*
1377	 * If labeling is NOT enabled, then disallow the mount of datasets
1378	 * which have a non-default label already.  No other label checks
1379	 * are needed.
1380	 */
1381	if (!is_system_labeled()) {
1382		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1383			return (0);
1384		return (EACCES);
1385	}
1386
1387	/*
1388	 * Get the label of the mountpoint.  If mounting into the global
1389	 * zone (i.e. mountpoint is not within an active zone and the
1390	 * zoned property is off), the label must be default or
1391	 * admin_low/admin_high only; no other checks are needed.
1392	 */
1393	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
1394	if (mntzone->zone_id == GLOBAL_ZONEID) {
1395		uint64_t zoned;
1396
1397		zone_rele(mntzone);
1398
1399		if (dsl_prop_get_integer(osname,
1400		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
1401			return (EACCES);
1402		if (!zoned)
1403			return (zfs_check_global_label(osname, ds_hexsl));
1404		else
1405			/*
1406			 * This is the case of a zone dataset being mounted
1407			 * initially, before the zone has been fully created;
1408			 * allow this mount into global zone.
1409			 */
1410			return (0);
1411	}
1412
1413	mnt_tsl = mntzone->zone_slabel;
1414	ASSERT(mnt_tsl != NULL);
1415	label_hold(mnt_tsl);
1416	mnt_sl = label2bslabel(mnt_tsl);
1417
1418	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
1419		/*
1420		 * The dataset doesn't have a real label, so fabricate one.
1421		 */
1422		char *str = NULL;
1423
1424		if (l_to_str_internal(mnt_sl, &str) == 0 &&
1425		    dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1426		    ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0)
1427			retv = 0;
1428		if (str != NULL)
1429			kmem_free(str, strlen(str) + 1);
1430	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
1431		/*
1432		 * Now compare labels to complete the MAC check.  If the
1433		 * labels are equal then allow access.  If the mountpoint
1434		 * label dominates the dataset label, allow readonly access.
1435		 * Otherwise, access is denied.
1436		 */
1437		if (blequal(mnt_sl, &ds_sl))
1438			retv = 0;
1439		else if (bldominates(mnt_sl, &ds_sl)) {
1440			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1441			retv = 0;
1442		}
1443	}
1444
1445	label_rele(mnt_tsl);
1446	zone_rele(mntzone);
1447	return (retv);
1448}
1449
1450static int
1451zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
1452{
1453	int error = 0;
1454	static int zfsrootdone = 0;
1455	zfsvfs_t *zfsvfs = NULL;
1456	znode_t *zp = NULL;
1457	vnode_t *vp = NULL;
1458	char *zfs_bootfs;
1459	char *zfs_devid;
1460
1461	ASSERT(vfsp);
1462
1463	/*
1464	 * The filesystem that we mount as root is defined in the
1465	 * boot property "zfs-bootfs" with a format of
1466	 * "poolname/root-dataset-objnum".
1467	 */
1468	if (why == ROOT_INIT) {
1469		if (zfsrootdone++)
1470			return (EBUSY);
1471		/*
 1472		 * Doing a spa_load requires the clock to be set, and that
 1473		 * happens before we could do anything better (for example,
 1474		 * derive a time from the timestamp on an uberblock), so
 1475		 * just set it to -1.
1476		 */
1477		clkset(-1);
1478
1479		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
 1480			cmn_err(CE_NOTE, "spa_get_bootfs: cannot get "
1481			    "bootfs name");
1482			return (EINVAL);
1483		}
1484		zfs_devid = spa_get_bootprop("diskdevid");
1485		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
1486		if (zfs_devid)
1487			spa_free_bootprop(zfs_devid);
1488		if (error) {
1489			spa_free_bootprop(zfs_bootfs);
1490			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
1491			    error);
1492			return (error);
1493		}
1494		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
1495			spa_free_bootprop(zfs_bootfs);
1496			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
1497			    error);
1498			return (error);
1499		}
1500
1501		spa_free_bootprop(zfs_bootfs);
1502
1503		if (error = vfs_lock(vfsp))
1504			return (error);
1505
1506		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
1507			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
1508			goto out;
1509		}
1510
1511		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
1512		ASSERT(zfsvfs);
1513		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
1514			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
1515			goto out;
1516		}
1517
1518		vp = ZTOV(zp);
1519		mutex_enter(&vp->v_lock);
1520		vp->v_flag |= VROOT;
1521		mutex_exit(&vp->v_lock);
1522		rootvp = vp;
1523
1524		/*
1525		 * Leave rootvp held.  The root file system is never unmounted.
1526		 */
1527
1528		vfs_add((struct vnode *)0, vfsp,
1529		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
1530out:
1531		vfs_unlock(vfsp);
1532		return (error);
1533	} else if (why == ROOT_REMOUNT) {
1534		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
1535		vfsp->vfs_flag |= VFS_REMOUNT;
1536
1537		/* refresh mount options */
1538		zfs_unregister_callbacks(vfsp->vfs_data);
1539		return (zfs_register_callbacks(vfsp));
1540
1541	} else if (why == ROOT_UNMOUNT) {
1542		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
1543		(void) zfs_sync(vfsp, 0, 0);
1544		return (0);
1545	}
1546
1547	/*
 1548	 * If "why" is anything other than ROOT_INIT, ROOT_REMOUNT,
 1549	 * or ROOT_UNMOUNT, we do not support it.
1550	 */
1551	return (ENOTSUP);
1552}
1553
1554/*ARGSUSED*/
1555static int
1556zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
1557{
1558	char		*osname;
1559	pathname_t	spn;
1560	int		error = 0;
1561	uio_seg_t	fromspace = (uap->flags & MS_SYSSPACE) ?
1562	    UIO_SYSSPACE : UIO_USERSPACE;
1563	int		canwrite;
1564
1565	if (mvp->v_type != VDIR)
1566		return (ENOTDIR);
1567
1568	mutex_enter(&mvp->v_lock);
1569	if ((uap->flags & MS_REMOUNT) == 0 &&
1570	    (uap->flags & MS_OVERLAY) == 0 &&
1571	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
1572		mutex_exit(&mvp->v_lock);
1573		return (EBUSY);
1574	}
1575	mutex_exit(&mvp->v_lock);
1576
1577	/*
1578	 * ZFS does not support passing unparsed data in via MS_DATA.
1579	 * Users should use the MS_OPTIONSTR interface; this means
1580	 * that all option parsing is already done and the options struct
1581	 * can be interrogated.
1582	 */
1583	if ((uap->flags & MS_DATA) && uap->datalen > 0)
1584		return (EINVAL);
1585
1586	/*
1587	 * Get the objset name (the "special" mount argument).
1588	 */
1589	if (error = pn_get(uap->spec, fromspace, &spn))
1590		return (error);
1591
1592	osname = spn.pn_path;
1593
1594	/*
 1595	 * Check for mount privilege.
 1596	 *
 1597	 * If we don't have the privilege, see if
 1598	 * we have local permission to allow it.
1599	 */
1600	error = secpolicy_fs_mount(cr, mvp, vfsp);
1601	if (error) {
1602		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) {
1603			vattr_t		vattr;
1604
1605			/*
1606			 * Make sure user is the owner of the mount point
1607			 * or has sufficient privileges.
1608			 */
1609
1610			vattr.va_mask = AT_UID;
1611
1612			if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
1613				goto out;
1614			}
1615
1616			if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
1617			    VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
1618				goto out;
1619			}
1620			secpolicy_fs_mount_clearopts(cr, vfsp);
1621		} else {
1622			goto out;
1623		}
1624	}
1625
1626	/*
1627	 * Refuse to mount a filesystem if we are in a local zone and the
1628	 * dataset is not visible.
1629	 */
1630	if (!INGLOBALZONE(curproc) &&
1631	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1632		error = EPERM;
1633		goto out;
1634	}
1635
1636	error = zfs_mount_label_policy(vfsp, osname);
1637	if (error)
1638		goto out;
1639
1640	/*
1641	 * When doing a remount, we simply refresh our temporary properties
1642	 * according to those options set in the current VFS options.
1643	 */
1644	if (uap->flags & MS_REMOUNT) {
1645		/* refresh mount options */
1646		zfs_unregister_callbacks(vfsp->vfs_data);
1647		error = zfs_register_callbacks(vfsp);
1648		goto out;
1649	}
1650
1651	error = zfs_domount(vfsp, osname);
1652
1653	/*
1654	 * Add an extra VFS_HOLD on our parent vfs so that it can't
1655	 * disappear due to a forced unmount.
1656	 */
1657	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
1658		VFS_HOLD(mvp->v_vfsp);
1659
1660out:
1661	pn_free(&spn);
1662	return (error);
1663}
1664
1665static int
1666zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
1667{
1668	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1669	dev32_t d32;
1670	uint64_t refdbytes, availbytes, usedobjs, availobjs;
1671
1672	ZFS_ENTER(zfsvfs);
1673
1674	dmu_objset_space(zfsvfs->z_os,
1675	    &refdbytes, &availbytes, &usedobjs, &availobjs);
1676
1677	/*
1678	 * The underlying storage pool actually uses multiple block sizes.
1679	 * We report the fragsize as the smallest block size we support,
1680	 * and we report our blocksize as the filesystem's maximum blocksize.
1681	 */
1682	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
1683	statp->f_bsize = zfsvfs->z_max_blksz;
1684
1685	/*
1686	 * The following report "total" blocks of various kinds in the
1687	 * file system, but reported in terms of f_frsize - the
1688	 * "fragment" size.
1689	 */
1690
1691	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1692	statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
1693	statp->f_bavail = statp->f_bfree; /* no root reservation */
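	/*
	 * Since SPA_MINBLOCKSHIFT is 9, f_frsize is 512 and the three
	 * counts above are expressed in 512-byte units regardless of
	 * the file system's actual (larger) block sizes.
	 */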
1694
1695	/*
1696	 * statvfs() should really be called statufs(), because it assumes
1697	 * static metadata.  ZFS doesn't preallocate files, so the best
1698	 * we can do is report the max that could possibly fit in f_files,
1699	 * and that minus the number actually used in f_ffree.
 1700	 * For f_ffree, report the smaller of the number of objects available
1701	 * and the number of blocks (each object will take at least a block).
1702	 */
1703	statp->f_ffree = MIN(availobjs, statp->f_bfree);
1704	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
1705	statp->f_files = statp->f_ffree + usedobjs;
1706
1707	(void) cmpldev(&d32, vfsp->vfs_dev);
1708	statp->f_fsid = d32;
1709
1710	/*
1711	 * We're a zfs filesystem.
1712	 */
1713	(void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
1714
1715	statp->f_flag = vf_to_stf(vfsp->vfs_flag);
1716
1717	statp->f_namemax = ZFS_MAXNAMELEN;
1718
1719	/*
1720	 * We have all of 32 characters to stuff a string here.
1721	 * Is there anything useful we could/should provide?
1722	 */
1723	bzero(statp->f_fstr, sizeof (statp->f_fstr));
1724
1725	ZFS_EXIT(zfsvfs);
1726	return (0);
1727}
1728
1729static int
1730zfs_root(vfs_t *vfsp, vnode_t **vpp)
1731{
1732	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1733	znode_t *rootzp;
1734	int error;
1735
1736	ZFS_ENTER(zfsvfs);
1737
1738	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1739	if (error == 0)
1740		*vpp = ZTOV(rootzp);
1741
1742	ZFS_EXIT(zfsvfs);
1743	return (error);
1744}
1745
1746/*
1747 * Teardown the zfsvfs::z_os.
1748 *
 1749 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1750 * and 'z_teardown_inactive_lock' held.
1751 */
1752static int
1753zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1754{
1755	znode_t	*zp;
1756
1757	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1758
1759	if (!unmounting) {
1760		/*
1761		 * We purge the parent filesystem's vfsp as the parent
1762		 * filesystem and all of its snapshots have their vnode's
1763		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
1764		 * 'z_parent' is self referential for non-snapshots.
1765		 */
1766		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1767	}
1768
1769	/*
1770	 * Close the zil. NB: Can't close the zil while zfs_inactive
1771	 * threads are blocked as zil_close can call zfs_inactive.
1772	 */
1773	if (zfsvfs->z_log) {
1774		zil_close(zfsvfs->z_log);
1775		zfsvfs->z_log = NULL;
1776	}
1777
1778	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
1779
1780	/*
 1781	 * If we are not unmounting (i.e., online recv) and someone already
1782	 * unmounted this file system while we were doing the switcheroo,
1783	 * or a reopen of z_os failed then just bail out now.
1784	 */
1785	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1786		rw_exit(&zfsvfs->z_teardown_inactive_lock);
1787		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1788		return (EIO);
1789	}
1790
1791	/*
1792	 * At this point there are no vops active, and any new vops will
1793	 * fail with EIO since we have z_teardown_lock for writer (only
 1794	 * relevant for forced unmount).
1795	 *
1796	 * Release all holds on dbufs.
1797	 */
1798	mutex_enter(&zfsvfs->z_znodes_lock);
1799	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1800	    zp = list_next(&zfsvfs->z_all_znodes, zp))
1801		if (zp->z_sa_hdl) {
1802			ASSERT(ZTOV(zp)->v_count > 0);
1803			zfs_znode_dmu_fini(zp);
1804		}
1805	mutex_exit(&zfsvfs->z_znodes_lock);
1806
1807	/*
1808	 * If we are unmounting, set the unmounted flag and let new vops
1809	 * unblock.  zfs_inactive will have the unmounted behavior, and all
1810	 * other vops will fail with EIO.
1811	 */
1812	if (unmounting) {
1813		zfsvfs->z_unmounted = B_TRUE;
1814		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1815		rw_exit(&zfsvfs->z_teardown_inactive_lock);
1816	}
1817
1818	/*
1819	 * z_os will be NULL if there was an error in attempting to reopen
1820	 * zfsvfs, so just return as the properties had already been
1821	 * unregistered and cached data had been evicted before.
1822	 */
1823	if (zfsvfs->z_os == NULL)
1824		return (0);
1825
1826	/*
1827	 * Unregister properties.
1828	 */
1829	zfs_unregister_callbacks(zfsvfs);
1830
1831	/*
1832	 * Evict cached data
1833	 */
1834	if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os))
1835		if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
1836			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1837	(void) dmu_objset_evict_dbufs(zfsvfs->z_os);
1838
1839	return (0);
1840}
1841
1842/*ARGSUSED*/
1843static int
1844zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
1845{
1846	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1847	objset_t *os;
1848	int ret;
1849
1850	ret = secpolicy_fs_unmount(cr, vfsp);
1851	if (ret) {
1852		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1853		    ZFS_DELEG_PERM_MOUNT, cr))
1854			return (ret);
1855	}
1856
1857	/*
1858	 * We purge the parent filesystem's vfsp as the parent filesystem
1859	 * and all of its snapshots have their vnode's v_vfsp set to the
1860	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
1861	 * referential for non-snapshots.
1862	 */
1863	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1864
1865	/*
1866	 * Unmount any snapshots mounted under .zfs before unmounting the
1867	 * dataset itself.
1868	 */
1869	if (zfsvfs->z_ctldir != NULL &&
1870	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
1871		return (ret);
1872	}
1873
1874	if (!(fflag & MS_FORCE)) {
1875		/*
1876		 * Check the number of active vnodes in the file system.
1877		 * Our count is maintained in the vfs structure, but the
1878		 * number is off by 1 to indicate a hold on the vfs
1879		 * structure itself.
1880		 *
1881		 * The '.zfs' directory maintains a reference of its
1882		 * own, and any active references underneath are
1883		 * reflected in the vnode count.
1884		 */
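		/*
		 * Concretely: with no '.zfs' node the only permissible
		 * hold is the vfs's own (vfs_count == 1); with '.zfs'
		 * present we also tolerate its hold on the vfs
		 * (vfs_count == 2) and a single hold on the '.zfs'
		 * vnode itself (v_count == 1).
		 */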
1885		if (zfsvfs->z_ctldir == NULL) {
1886			if (vfsp->vfs_count > 1)
1887				return (EBUSY);
1888		} else {
1889			if (vfsp->vfs_count > 2 ||
1890			    zfsvfs->z_ctldir->v_count > 1)
1891				return (EBUSY);
1892		}
1893	}
1894
1895	vfsp->vfs_flag |= VFS_UNMOUNTED;
1896
1897	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1898	os = zfsvfs->z_os;
1899
1900	/*
1901	 * z_os will be NULL if there was an error in
1902	 * attempting to reopen zfsvfs.
1903	 */
1904	if (os != NULL) {
1905		/*
1906		 * Unset the objset user_ptr.
1907		 */
1908		mutex_enter(&os->os_user_ptr_lock);
1909		dmu_objset_set_user(os, NULL);
1910		mutex_exit(&os->os_user_ptr_lock);
1911
1912		/*
1913		 * Finally release the objset
1914		 */
1915		dmu_objset_disown(os, zfsvfs);
1916	}
1917
1918	/*
1919	 * We can now safely destroy the '.zfs' directory node.
1920	 */
1921	if (zfsvfs->z_ctldir != NULL)
1922		zfsctl_destroy(zfsvfs);
1923
1924	return (0);
1925}
1926
1927static int
1928zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp

Large files are truncated; the listing ends here, at line 1928 of 2317.