PageRenderTime 106ms CodeModel.GetById 3ms app.highlight 90ms RepoModel.GetById 1ms app.codeStats 1ms

/fs/xfs/xfs_vnodeops.c

https://bitbucket.org/lgorence/linux
C | 1874 lines | 1207 code | 230 blank | 437 comment | 289 complexity | d3181dd8ca8c15e78d5093069647bce7 MD5 | raw file
   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * Copyright (c) 2012 Red Hat, Inc.
   4 * All Rights Reserved.
   5 *
   6 * This program is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU General Public License as
   8 * published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it would be useful,
  11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 * GNU General Public License for more details.
  14 *
  15 * You should have received a copy of the GNU General Public License
  16 * along with this program; if not, write the Free Software Foundation,
  17 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18 */
  19
  20#include "xfs.h"
  21#include "xfs_fs.h"
  22#include "xfs_types.h"
  23#include "xfs_bit.h"
  24#include "xfs_log.h"
  25#include "xfs_trans.h"
  26#include "xfs_sb.h"
  27#include "xfs_ag.h"
  28#include "xfs_dir2.h"
  29#include "xfs_mount.h"
  30#include "xfs_da_btree.h"
  31#include "xfs_bmap_btree.h"
  32#include "xfs_ialloc_btree.h"
  33#include "xfs_dinode.h"
  34#include "xfs_inode.h"
  35#include "xfs_inode_item.h"
  36#include "xfs_itable.h"
  37#include "xfs_ialloc.h"
  38#include "xfs_alloc.h"
  39#include "xfs_bmap.h"
  40#include "xfs_acl.h"
  41#include "xfs_attr.h"
  42#include "xfs_error.h"
  43#include "xfs_quota.h"
  44#include "xfs_utils.h"
  45#include "xfs_rtalloc.h"
  46#include "xfs_trans_space.h"
  47#include "xfs_log_priv.h"
  48#include "xfs_filestream.h"
  49#include "xfs_vnodeops.h"
  50#include "xfs_trace.h"
  51#include "xfs_icache.h"
  52#include "xfs_symlink.h"
  53
  54
  55/*
  56 * This is called by xfs_inactive to free any blocks beyond eof
  57 * when the link count isn't zero and by xfs_dm_punch_hole() when
  58 * punching a hole to EOF.
  59 */
  60int
  61xfs_free_eofblocks(
  62	xfs_mount_t	*mp,
  63	xfs_inode_t	*ip,
  64	bool		need_iolock)
  65{
  66	xfs_trans_t	*tp;
  67	int		error;
  68	xfs_fileoff_t	end_fsb;
  69	xfs_fileoff_t	last_fsb;
  70	xfs_filblks_t	map_len;
  71	int		nimaps;
  72	xfs_bmbt_irec_t	imap;
  73
  74	/*
  75	 * Figure out if there are any blocks beyond the end
  76	 * of the file.  If not, then there is nothing to do.
  77	 */
  78	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
  79	last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
  80	if (last_fsb <= end_fsb)
  81		return 0;
  82	map_len = last_fsb - end_fsb;
  83
  84	nimaps = 1;
  85	xfs_ilock(ip, XFS_ILOCK_SHARED);
  86	error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
  87	xfs_iunlock(ip, XFS_ILOCK_SHARED);
  88
  89	if (!error && (nimaps != 0) &&
  90	    (imap.br_startblock != HOLESTARTBLOCK ||
  91	     ip->i_delayed_blks)) {
  92		/*
  93		 * Attach the dquots to the inode up front.
  94		 */
  95		error = xfs_qm_dqattach(ip, 0);
  96		if (error)
  97			return error;
  98
  99		/*
 100		 * There are blocks after the end of file.
 101		 * Free them up now by truncating the file to
 102		 * its current size.
 103		 */
 104		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 105
 106		if (need_iolock) {
 107			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
 108				xfs_trans_cancel(tp, 0);
 109				return EAGAIN;
 110			}
 111		}
 112
 113		error = xfs_trans_reserve(tp, 0,
 114					  XFS_ITRUNCATE_LOG_RES(mp),
 115					  0, XFS_TRANS_PERM_LOG_RES,
 116					  XFS_ITRUNCATE_LOG_COUNT);
 117		if (error) {
 118			ASSERT(XFS_FORCED_SHUTDOWN(mp));
 119			xfs_trans_cancel(tp, 0);
 120			if (need_iolock)
 121				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 122			return error;
 123		}
 124
 125		xfs_ilock(ip, XFS_ILOCK_EXCL);
 126		xfs_trans_ijoin(tp, ip, 0);
 127
 128		/*
 129		 * Do not update the on-disk file size.  If we update the
 130		 * on-disk file size and then the system crashes before the
 131		 * contents of the file are flushed to disk then the files
 132		 * may be full of holes (ie NULL files bug).
 133		 */
 134		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
 135					      XFS_ISIZE(ip));
 136		if (error) {
 137			/*
 138			 * If we get an error at this point we simply don't
 139			 * bother truncating the file.
 140			 */
 141			xfs_trans_cancel(tp,
 142					 (XFS_TRANS_RELEASE_LOG_RES |
 143					  XFS_TRANS_ABORT));
 144		} else {
 145			error = xfs_trans_commit(tp,
 146						XFS_TRANS_RELEASE_LOG_RES);
 147			if (!error)
 148				xfs_inode_clear_eofblocks_tag(ip);
 149		}
 150
 151		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 152		if (need_iolock)
 153			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 154	}
 155	return error;
 156}
 157
 158int
 159xfs_release(
 160	xfs_inode_t	*ip)
 161{
 162	xfs_mount_t	*mp = ip->i_mount;
 163	int		error;
 164
 165	if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
 166		return 0;
 167
 168	/* If this is a read-only mount, don't do this (would generate I/O) */
 169	if (mp->m_flags & XFS_MOUNT_RDONLY)
 170		return 0;
 171
 172	if (!XFS_FORCED_SHUTDOWN(mp)) {
 173		int truncated;
 174
 175		/*
 176		 * If we are using filestreams, and we have an unlinked
 177		 * file that we are processing the last close on, then nothing
 178		 * will be able to reopen and write to this file. Purge this
 179		 * inode from the filestreams cache so that it doesn't delay
 180		 * teardown of the inode.
 181		 */
 182		if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
 183			xfs_filestream_deassociate(ip);
 184
 185		/*
 186		 * If we previously truncated this file and removed old data
 187		 * in the process, we want to initiate "early" writeout on
 188		 * the last close.  This is an attempt to combat the notorious
 189		 * NULL files problem which is particularly noticeable from a
 190		 * truncate down, buffered (re-)write (delalloc), followed by
 191		 * a crash.  What we are effectively doing here is
 192		 * significantly reducing the time window where we'd otherwise
 193		 * be exposed to that problem.
 194		 */
 195		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
 196		if (truncated) {
 197			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
 198			if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
 199				error = -filemap_flush(VFS_I(ip)->i_mapping);
 200				if (error)
 201					return error;
 202			}
 203		}
 204	}
 205
 206	if (ip->i_d.di_nlink == 0)
 207		return 0;
 208
 209	if (xfs_can_free_eofblocks(ip, false)) {
 210
 211		/*
 212		 * If we can't get the iolock just skip truncating the blocks
 213		 * past EOF because we could deadlock with the mmap_sem
 214		 * otherwise.  We'll get another chance to drop them once the
 215		 * last reference to the inode is dropped, so we'll never leak
 216		 * blocks permanently.
 217		 *
 218		 * Further, check if the inode is being opened, written and
 219		 * closed frequently and we have delayed allocation blocks
 220		 * outstanding (e.g. streaming writes from the NFS server),
 221		 * truncating the blocks past EOF will cause fragmentation to
 222		 * occur.
 223		 *
 224		 * In this case don't do the truncation, either, but we have to
 225		 * be careful how we detect this case. Blocks beyond EOF show
 226		 * up as i_delayed_blks even when the inode is clean, so we
 227		 * need to truncate them away first before checking for a dirty
 228		 * release. Hence on the first dirty close we will still remove
 229		 * the speculative allocation, but after that we will leave it
 230		 * in place.
 231		 */
 232		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
 233			return 0;
 234
 235		error = xfs_free_eofblocks(mp, ip, true);
 236		if (error && error != EAGAIN)
 237			return error;
 238
 239		/* delalloc blocks after truncation means it really is dirty */
 240		if (ip->i_delayed_blks)
 241			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
 242	}
 243	return 0;
 244}
 245
 246/*
 247 * xfs_inactive
 248 *
 249 * This is called when the vnode reference count for the vnode
 250 * goes to zero.  If the file has been unlinked, then it must
 251 * now be truncated.  Also, we clear all of the read-ahead state
 252 * kept for the inode here since the file is now closed.
 253 */
 254int
 255xfs_inactive(
 256	xfs_inode_t	*ip)
 257{
 258	xfs_bmap_free_t	free_list;
 259	xfs_fsblock_t	first_block;
 260	int		committed;
 261	xfs_trans_t	*tp;
 262	xfs_mount_t	*mp;
 263	int		error;
 264	int		truncate = 0;
 265
 266	/*
 267	 * If the inode is already free, then there can be nothing
 268	 * to clean up here.
 269	 */
 270	if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
 271		ASSERT(ip->i_df.if_real_bytes == 0);
 272		ASSERT(ip->i_df.if_broot_bytes == 0);
 273		return VN_INACTIVE_CACHE;
 274	}
 275
 276	mp = ip->i_mount;
 277
 278	error = 0;
 279
 280	/* If this is a read-only mount, don't do this (would generate I/O) */
 281	if (mp->m_flags & XFS_MOUNT_RDONLY)
 282		goto out;
 283
 284	if (ip->i_d.di_nlink != 0) {
 285		/*
 286		 * force is true because we are evicting an inode from the
 287		 * cache. Post-eof blocks must be freed, lest we end up with
 288		 * broken free space accounting.
 289		 */
 290		if (xfs_can_free_eofblocks(ip, true)) {
 291			error = xfs_free_eofblocks(mp, ip, false);
 292			if (error)
 293				return VN_INACTIVE_CACHE;
 294		}
 295		goto out;
 296	}
 297
 298	if (S_ISREG(ip->i_d.di_mode) &&
 299	    (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
 300	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
 301		truncate = 1;
 302
 303	error = xfs_qm_dqattach(ip, 0);
 304	if (error)
 305		return VN_INACTIVE_CACHE;
 306
 307	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 308	error = xfs_trans_reserve(tp, 0,
 309			(truncate || S_ISLNK(ip->i_d.di_mode)) ?
 310				XFS_ITRUNCATE_LOG_RES(mp) :
 311				XFS_IFREE_LOG_RES(mp),
 312			0,
 313			XFS_TRANS_PERM_LOG_RES,
 314			XFS_ITRUNCATE_LOG_COUNT);
 315	if (error) {
 316		ASSERT(XFS_FORCED_SHUTDOWN(mp));
 317		xfs_trans_cancel(tp, 0);
 318		return VN_INACTIVE_CACHE;
 319	}
 320
 321	xfs_ilock(ip, XFS_ILOCK_EXCL);
 322	xfs_trans_ijoin(tp, ip, 0);
 323
 324	if (S_ISLNK(ip->i_d.di_mode)) {
 325		/*
 326		 * Zero length symlinks _can_ exist.
 327		 */
 328		if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) {
 329			error = xfs_inactive_symlink_rmt(ip, &tp);
 330			if (error)
 331				goto out_cancel;
 332		} else if (ip->i_df.if_bytes > 0) {
 333			xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
 334					  XFS_DATA_FORK);
 335			ASSERT(ip->i_df.if_bytes == 0);
 336		}
 337	} else if (truncate) {
 338		ip->i_d.di_size = 0;
 339		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 340
 341		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
 342		if (error)
 343			goto out_cancel;
 344
 345		ASSERT(ip->i_d.di_nextents == 0);
 346	}
 347
 348	/*
 349	 * If there are attributes associated with the file then blow them away
 350	 * now.  The code calls a routine that recursively deconstructs the
 351	 * attribute fork.  We need to just commit the current transaction
 352	 * because we can't use it for xfs_attr_inactive().
 353	 */
 354	if (ip->i_d.di_anextents > 0) {
 355		ASSERT(ip->i_d.di_forkoff != 0);
 356
 357		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 358		if (error)
 359			goto out_unlock;
 360
 361		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 362
 363		error = xfs_attr_inactive(ip);
 364		if (error)
 365			goto out;
 366
 367		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 368		error = xfs_trans_reserve(tp, 0,
 369					  XFS_IFREE_LOG_RES(mp),
 370					  0, XFS_TRANS_PERM_LOG_RES,
 371					  XFS_INACTIVE_LOG_COUNT);
 372		if (error) {
 373			xfs_trans_cancel(tp, 0);
 374			goto out;
 375		}
 376
 377		xfs_ilock(ip, XFS_ILOCK_EXCL);
 378		xfs_trans_ijoin(tp, ip, 0);
 379	}
 380
 381	if (ip->i_afp)
 382		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 383
 384	ASSERT(ip->i_d.di_anextents == 0);
 385
 386	/*
 387	 * Free the inode.
 388	 */
 389	xfs_bmap_init(&free_list, &first_block);
 390	error = xfs_ifree(tp, ip, &free_list);
 391	if (error) {
 392		/*
 393		 * If we fail to free the inode, shut down.  The cancel
 394		 * might do that, we need to make sure.  Otherwise the
 395		 * inode might be lost for a long time or forever.
 396		 */
 397		if (!XFS_FORCED_SHUTDOWN(mp)) {
 398			xfs_notice(mp, "%s: xfs_ifree returned error %d",
 399				__func__, error);
 400			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 401		}
 402		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
 403	} else {
 404		/*
 405		 * Credit the quota account(s). The inode is gone.
 406		 */
 407		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
 408
 409		/*
 410		 * Just ignore errors at this point.  There is nothing we can
 411		 * do except to try to keep going. Make sure it's not a silent
 412		 * error.
 413		 */
 414		error = xfs_bmap_finish(&tp,  &free_list, &committed);
 415		if (error)
 416			xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
 417				__func__, error);
 418		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 419		if (error)
 420			xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
 421				__func__, error);
 422	}
 423
 424	/*
 425	 * Release the dquots held by inode, if any.
 426	 */
 427	xfs_qm_dqdetach(ip);
 428out_unlock:
 429	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 430out:
 431	return VN_INACTIVE_CACHE;
 432out_cancel:
 433	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 434	goto out_unlock;
 435}
 436
 437/*
 438 * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
 439 * is allowed, otherwise it has to be an exact match. If a CI match is found,
 440 * ci_name->name will point to a the actual name (caller must free) or
 441 * will be set to NULL if an exact match is found.
 442 */
 443int
 444xfs_lookup(
 445	xfs_inode_t		*dp,
 446	struct xfs_name		*name,
 447	xfs_inode_t		**ipp,
 448	struct xfs_name		*ci_name)
 449{
 450	xfs_ino_t		inum;
 451	int			error;
 452	uint			lock_mode;
 453
 454	trace_xfs_lookup(dp, name);
 455
 456	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 457		return XFS_ERROR(EIO);
 458
 459	lock_mode = xfs_ilock_map_shared(dp);
 460	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
 461	xfs_iunlock_map_shared(dp, lock_mode);
 462
 463	if (error)
 464		goto out;
 465
 466	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
 467	if (error)
 468		goto out_free_name;
 469
 470	return 0;
 471
 472out_free_name:
 473	if (ci_name)
 474		kmem_free(ci_name->name);
 475out:
 476	*ipp = NULL;
 477	return error;
 478}
 479
 480int
 481xfs_create(
 482	xfs_inode_t		*dp,
 483	struct xfs_name		*name,
 484	umode_t			mode,
 485	xfs_dev_t		rdev,
 486	xfs_inode_t		**ipp)
 487{
 488	int			is_dir = S_ISDIR(mode);
 489	struct xfs_mount	*mp = dp->i_mount;
 490	struct xfs_inode	*ip = NULL;
 491	struct xfs_trans	*tp = NULL;
 492	int			error;
 493	xfs_bmap_free_t		free_list;
 494	xfs_fsblock_t		first_block;
 495	bool                    unlock_dp_on_error = false;
 496	uint			cancel_flags;
 497	int			committed;
 498	prid_t			prid;
 499	struct xfs_dquot	*udqp = NULL;
 500	struct xfs_dquot	*gdqp = NULL;
 501	uint			resblks;
 502	uint			log_res;
 503	uint			log_count;
 504
 505	trace_xfs_create(dp, name);
 506
 507	if (XFS_FORCED_SHUTDOWN(mp))
 508		return XFS_ERROR(EIO);
 509
 510	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 511		prid = xfs_get_projid(dp);
 512	else
 513		prid = XFS_PROJID_DEFAULT;
 514
 515	/*
 516	 * Make sure that we have allocated dquot(s) on disk.
 517	 */
 518	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
 519			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 520	if (error)
 521		return error;
 522
 523	if (is_dir) {
 524		rdev = 0;
 525		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
 526		log_res = XFS_MKDIR_LOG_RES(mp);
 527		log_count = XFS_MKDIR_LOG_COUNT;
 528		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
 529	} else {
 530		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
 531		log_res = XFS_CREATE_LOG_RES(mp);
 532		log_count = XFS_CREATE_LOG_COUNT;
 533		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
 534	}
 535
 536	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 537
 538	/*
 539	 * Initially assume that the file does not exist and
 540	 * reserve the resources for that case.  If that is not
 541	 * the case we'll drop the one we have and get a more
 542	 * appropriate transaction later.
 543	 */
 544	error = xfs_trans_reserve(tp, resblks, log_res, 0,
 545			XFS_TRANS_PERM_LOG_RES, log_count);
 546	if (error == ENOSPC) {
 547		/* flush outstanding delalloc blocks and retry */
 548		xfs_flush_inodes(mp);
 549		error = xfs_trans_reserve(tp, resblks, log_res, 0,
 550				XFS_TRANS_PERM_LOG_RES, log_count);
 551	}
 552	if (error == ENOSPC) {
 553		/* No space at all so try a "no-allocation" reservation */
 554		resblks = 0;
 555		error = xfs_trans_reserve(tp, 0, log_res, 0,
 556				XFS_TRANS_PERM_LOG_RES, log_count);
 557	}
 558	if (error) {
 559		cancel_flags = 0;
 560		goto out_trans_cancel;
 561	}
 562
 563	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 564	unlock_dp_on_error = true;
 565
 566	xfs_bmap_init(&free_list, &first_block);
 567
 568	/*
 569	 * Reserve disk quota and the inode.
 570	 */
 571	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
 572	if (error)
 573		goto out_trans_cancel;
 574
 575	error = xfs_dir_canenter(tp, dp, name, resblks);
 576	if (error)
 577		goto out_trans_cancel;
 578
 579	/*
 580	 * A newly created regular or special file just has one directory
 581	 * entry pointing to them, but a directory also the "." entry
 582	 * pointing to itself.
 583	 */
 584	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
 585			       prid, resblks > 0, &ip, &committed);
 586	if (error) {
 587		if (error == ENOSPC)
 588			goto out_trans_cancel;
 589		goto out_trans_abort;
 590	}
 591
 592	/*
 593	 * Now we join the directory inode to the transaction.  We do not do it
 594	 * earlier because xfs_dir_ialloc might commit the previous transaction
 595	 * (and release all the locks).  An error from here on will result in
 596	 * the transaction cancel unlocking dp so don't do it explicitly in the
 597	 * error path.
 598	 */
 599	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 600	unlock_dp_on_error = false;
 601
 602	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
 603					&first_block, &free_list, resblks ?
 604					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 605	if (error) {
 606		ASSERT(error != ENOSPC);
 607		goto out_trans_abort;
 608	}
 609	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 610	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 611
 612	if (is_dir) {
 613		error = xfs_dir_init(tp, ip, dp);
 614		if (error)
 615			goto out_bmap_cancel;
 616
 617		error = xfs_bumplink(tp, dp);
 618		if (error)
 619			goto out_bmap_cancel;
 620	}
 621
 622	/*
 623	 * If this is a synchronous mount, make sure that the
 624	 * create transaction goes to disk before returning to
 625	 * the user.
 626	 */
 627	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
 628		xfs_trans_set_sync(tp);
 629
 630	/*
 631	 * Attach the dquot(s) to the inodes and modify them incore.
 632	 * These ids of the inode couldn't have changed since the new
 633	 * inode has been locked ever since it was created.
 634	 */
 635	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
 636
 637	error = xfs_bmap_finish(&tp, &free_list, &committed);
 638	if (error)
 639		goto out_bmap_cancel;
 640
 641	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 642	if (error)
 643		goto out_release_inode;
 644
 645	xfs_qm_dqrele(udqp);
 646	xfs_qm_dqrele(gdqp);
 647
 648	*ipp = ip;
 649	return 0;
 650
 651 out_bmap_cancel:
 652	xfs_bmap_cancel(&free_list);
 653 out_trans_abort:
 654	cancel_flags |= XFS_TRANS_ABORT;
 655 out_trans_cancel:
 656	xfs_trans_cancel(tp, cancel_flags);
 657 out_release_inode:
 658	/*
 659	 * Wait until after the current transaction is aborted to
 660	 * release the inode.  This prevents recursive transactions
 661	 * and deadlocks from xfs_inactive.
 662	 */
 663	if (ip)
 664		IRELE(ip);
 665
 666	xfs_qm_dqrele(udqp);
 667	xfs_qm_dqrele(gdqp);
 668
 669	if (unlock_dp_on_error)
 670		xfs_iunlock(dp, XFS_ILOCK_EXCL);
 671	return error;
 672}
 673
 674#ifdef DEBUG
 675int xfs_locked_n;
 676int xfs_small_retries;
 677int xfs_middle_retries;
 678int xfs_lots_retries;
 679int xfs_lock_delays;
 680#endif
 681
 682/*
 683 * Bump the subclass so xfs_lock_inodes() acquires each lock with
 684 * a different value
 685 */
 686static inline int
 687xfs_lock_inumorder(int lock_mode, int subclass)
 688{
 689	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
 690		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
 691	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
 692		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
 693
 694	return lock_mode;
 695}
 696
 697/*
 698 * The following routine will lock n inodes in exclusive mode.
 699 * We assume the caller calls us with the inodes in i_ino order.
 700 *
 701 * We need to detect deadlock where an inode that we lock
 702 * is in the AIL and we start waiting for another inode that is locked
 703 * by a thread in a long running transaction (such as truncate). This can
 704 * result in deadlock since the long running trans might need to wait
 705 * for the inode we just locked in order to push the tail and free space
 706 * in the log.
 707 */
 708void
 709xfs_lock_inodes(
 710	xfs_inode_t	**ips,
 711	int		inodes,
 712	uint		lock_mode)
 713{
 714	int		attempts = 0, i, j, try_lock;
 715	xfs_log_item_t	*lp;
 716
 717	ASSERT(ips && (inodes >= 2)); /* we need at least two */
 718
 719	try_lock = 0;
 720	i = 0;
 721
 722again:
 723	for (; i < inodes; i++) {
 724		ASSERT(ips[i]);
 725
 726		if (i && (ips[i] == ips[i-1]))	/* Already locked */
 727			continue;
 728
 729		/*
 730		 * If try_lock is not set yet, make sure all locked inodes
 731		 * are not in the AIL.
 732		 * If any are, set try_lock to be used later.
 733		 */
 734
 735		if (!try_lock) {
 736			for (j = (i - 1); j >= 0 && !try_lock; j--) {
 737				lp = (xfs_log_item_t *)ips[j]->i_itemp;
 738				if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
 739					try_lock++;
 740				}
 741			}
 742		}
 743
 744		/*
 745		 * If any of the previous locks we have locked is in the AIL,
 746		 * we must TRY to get the second and subsequent locks. If
 747		 * we can't get any, we must release all we have
 748		 * and try again.
 749		 */
 750
 751		if (try_lock) {
 752			/* try_lock must be 0 if i is 0. */
 753			/*
 754			 * try_lock means we have an inode locked
 755			 * that is in the AIL.
 756			 */
 757			ASSERT(i != 0);
 758			if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
 759				attempts++;
 760
 761				/*
 762				 * Unlock all previous guys and try again.
 763				 * xfs_iunlock will try to push the tail
 764				 * if the inode is in the AIL.
 765				 */
 766
 767				for(j = i - 1; j >= 0; j--) {
 768
 769					/*
 770					 * Check to see if we've already
 771					 * unlocked this one.
 772					 * Not the first one going back,
 773					 * and the inode ptr is the same.
 774					 */
 775					if ((j != (i - 1)) && ips[j] ==
 776								ips[j+1])
 777						continue;
 778
 779					xfs_iunlock(ips[j], lock_mode);
 780				}
 781
 782				if ((attempts % 5) == 0) {
 783					delay(1); /* Don't just spin the CPU */
 784#ifdef DEBUG
 785					xfs_lock_delays++;
 786#endif
 787				}
 788				i = 0;
 789				try_lock = 0;
 790				goto again;
 791			}
 792		} else {
 793			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
 794		}
 795	}
 796
 797#ifdef DEBUG
 798	if (attempts) {
 799		if (attempts < 5) xfs_small_retries++;
 800		else if (attempts < 100) xfs_middle_retries++;
 801		else xfs_lots_retries++;
 802	} else {
 803		xfs_locked_n++;
 804	}
 805#endif
 806}
 807
 808/*
 809 * xfs_lock_two_inodes() can only be used to lock one type of lock
 810 * at a time - the iolock or the ilock, but not both at once. If
 811 * we lock both at once, lockdep will report false positives saying
 812 * we have violated locking orders.
 813 */
 814void
 815xfs_lock_two_inodes(
 816	xfs_inode_t		*ip0,
 817	xfs_inode_t		*ip1,
 818	uint			lock_mode)
 819{
 820	xfs_inode_t		*temp;
 821	int			attempts = 0;
 822	xfs_log_item_t		*lp;
 823
 824	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
 825		ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
 826	ASSERT(ip0->i_ino != ip1->i_ino);
 827
 828	if (ip0->i_ino > ip1->i_ino) {
 829		temp = ip0;
 830		ip0 = ip1;
 831		ip1 = temp;
 832	}
 833
 834 again:
 835	xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
 836
 837	/*
 838	 * If the first lock we have locked is in the AIL, we must TRY to get
 839	 * the second lock. If we can't get it, we must release the first one
 840	 * and try again.
 841	 */
 842	lp = (xfs_log_item_t *)ip0->i_itemp;
 843	if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
 844		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
 845			xfs_iunlock(ip0, lock_mode);
 846			if ((++attempts % 5) == 0)
 847				delay(1); /* Don't just spin the CPU */
 848			goto again;
 849		}
 850	} else {
 851		xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
 852	}
 853}
 854
 855int
 856xfs_remove(
 857	xfs_inode_t             *dp,
 858	struct xfs_name		*name,
 859	xfs_inode_t		*ip)
 860{
 861	xfs_mount_t		*mp = dp->i_mount;
 862	xfs_trans_t             *tp = NULL;
 863	int			is_dir = S_ISDIR(ip->i_d.di_mode);
 864	int                     error = 0;
 865	xfs_bmap_free_t         free_list;
 866	xfs_fsblock_t           first_block;
 867	int			cancel_flags;
 868	int			committed;
 869	int			link_zero;
 870	uint			resblks;
 871	uint			log_count;
 872
 873	trace_xfs_remove(dp, name);
 874
 875	if (XFS_FORCED_SHUTDOWN(mp))
 876		return XFS_ERROR(EIO);
 877
 878	error = xfs_qm_dqattach(dp, 0);
 879	if (error)
 880		goto std_return;
 881
 882	error = xfs_qm_dqattach(ip, 0);
 883	if (error)
 884		goto std_return;
 885
 886	if (is_dir) {
 887		tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
 888		log_count = XFS_DEFAULT_LOG_COUNT;
 889	} else {
 890		tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
 891		log_count = XFS_REMOVE_LOG_COUNT;
 892	}
 893	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 894
 895	/*
 896	 * We try to get the real space reservation first,
 897	 * allowing for directory btree deletion(s) implying
 898	 * possible bmap insert(s).  If we can't get the space
 899	 * reservation then we use 0 instead, and avoid the bmap
 900	 * btree insert(s) in the directory code by, if the bmap
 901	 * insert tries to happen, instead trimming the LAST
 902	 * block from the directory.
 903	 */
 904	resblks = XFS_REMOVE_SPACE_RES(mp);
 905	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
 906				  XFS_TRANS_PERM_LOG_RES, log_count);
 907	if (error == ENOSPC) {
 908		resblks = 0;
 909		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
 910					  XFS_TRANS_PERM_LOG_RES, log_count);
 911	}
 912	if (error) {
 913		ASSERT(error != ENOSPC);
 914		cancel_flags = 0;
 915		goto out_trans_cancel;
 916	}
 917
 918	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 919
 920	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 921	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 922
 923	/*
 924	 * If we're removing a directory perform some additional validation.
 925	 */
 926	if (is_dir) {
 927		ASSERT(ip->i_d.di_nlink >= 2);
 928		if (ip->i_d.di_nlink != 2) {
 929			error = XFS_ERROR(ENOTEMPTY);
 930			goto out_trans_cancel;
 931		}
 932		if (!xfs_dir_isempty(ip)) {
 933			error = XFS_ERROR(ENOTEMPTY);
 934			goto out_trans_cancel;
 935		}
 936	}
 937
 938	xfs_bmap_init(&free_list, &first_block);
 939	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
 940					&first_block, &free_list, resblks);
 941	if (error) {
 942		ASSERT(error != ENOENT);
 943		goto out_bmap_cancel;
 944	}
 945	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 946
 947	if (is_dir) {
 948		/*
 949		 * Drop the link from ip's "..".
 950		 */
 951		error = xfs_droplink(tp, dp);
 952		if (error)
 953			goto out_bmap_cancel;
 954
 955		/*
 956		 * Drop the "." link from ip to self.
 957		 */
 958		error = xfs_droplink(tp, ip);
 959		if (error)
 960			goto out_bmap_cancel;
 961	} else {
 962		/*
 963		 * When removing a non-directory we need to log the parent
 964		 * inode here.  For a directory this is done implicitly
 965		 * by the xfs_droplink call for the ".." entry.
 966		 */
 967		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 968	}
 969
 970	/*
 971	 * Drop the link from dp to ip.
 972	 */
 973	error = xfs_droplink(tp, ip);
 974	if (error)
 975		goto out_bmap_cancel;
 976
 977	/*
 978	 * Determine if this is the last link while
 979	 * we are in the transaction.
 980	 */
 981	link_zero = (ip->i_d.di_nlink == 0);
 982
 983	/*
 984	 * If this is a synchronous mount, make sure that the
 985	 * remove transaction goes to disk before returning to
 986	 * the user.
 987	 */
 988	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
 989		xfs_trans_set_sync(tp);
 990
 991	error = xfs_bmap_finish(&tp, &free_list, &committed);
 992	if (error)
 993		goto out_bmap_cancel;
 994
 995	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 996	if (error)
 997		goto std_return;
 998
 999	/*
1000	 * If we are using filestreams, kill the stream association.
1001	 * If the file is still open it may get a new one but that
1002	 * will get killed on last close in xfs_close() so we don't
1003	 * have to worry about that.
1004	 */
1005	if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
1006		xfs_filestream_deassociate(ip);
1007
1008	return 0;
1009
1010 out_bmap_cancel:
1011	xfs_bmap_cancel(&free_list);
1012	cancel_flags |= XFS_TRANS_ABORT;
1013 out_trans_cancel:
1014	xfs_trans_cancel(tp, cancel_flags);
1015 std_return:
1016	return error;
1017}
1018
1019int
1020xfs_link(
1021	xfs_inode_t		*tdp,
1022	xfs_inode_t		*sip,
1023	struct xfs_name		*target_name)
1024{
1025	xfs_mount_t		*mp = tdp->i_mount;
1026	xfs_trans_t		*tp;
1027	int			error;
1028	xfs_bmap_free_t         free_list;
1029	xfs_fsblock_t           first_block;
1030	int			cancel_flags;
1031	int			committed;
1032	int			resblks;
1033
1034	trace_xfs_link(tdp, target_name);
1035
1036	ASSERT(!S_ISDIR(sip->i_d.di_mode));
1037
1038	if (XFS_FORCED_SHUTDOWN(mp))
1039		return XFS_ERROR(EIO);
1040
1041	error = xfs_qm_dqattach(sip, 0);
1042	if (error)
1043		goto std_return;
1044
1045	error = xfs_qm_dqattach(tdp, 0);
1046	if (error)
1047		goto std_return;
1048
1049	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
1050	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1051	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1052	error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
1053			XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
1054	if (error == ENOSPC) {
1055		resblks = 0;
1056		error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
1057				XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
1058	}
1059	if (error) {
1060		cancel_flags = 0;
1061		goto error_return;
1062	}
1063
1064	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1065
1066	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1067	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1068
1069	/*
1070	 * If we are using project inheritance, we only allow hard link
1071	 * creation in our tree when the project IDs are the same; else
1072	 * the tree quota mechanism could be circumvented.
1073	 */
1074	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1075		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1076		error = XFS_ERROR(EXDEV);
1077		goto error_return;
1078	}
1079
1080	error = xfs_dir_canenter(tp, tdp, target_name, resblks);
1081	if (error)
1082		goto error_return;
1083
1084	xfs_bmap_init(&free_list, &first_block);
1085
1086	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1087					&first_block, &free_list, resblks);
1088	if (error)
1089		goto abort_return;
1090	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1091	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1092
1093	error = xfs_bumplink(tp, sip);
1094	if (error)
1095		goto abort_return;
1096
1097	/*
1098	 * If this is a synchronous mount, make sure that the
1099	 * link transaction goes to disk before returning to
1100	 * the user.
1101	 */
1102	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1103		xfs_trans_set_sync(tp);
1104	}
1105
1106	error = xfs_bmap_finish (&tp, &free_list, &committed);
1107	if (error) {
1108		xfs_bmap_cancel(&free_list);
1109		goto abort_return;
1110	}
1111
1112	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1113
1114 abort_return:
1115	cancel_flags |= XFS_TRANS_ABORT;
1116 error_return:
1117	xfs_trans_cancel(tp, cancel_flags);
1118 std_return:
1119	return error;
1120}
1121
1122int
1123xfs_set_dmattrs(
1124	xfs_inode_t     *ip,
1125	u_int		evmask,
1126	u_int16_t	state)
1127{
1128	xfs_mount_t	*mp = ip->i_mount;
1129	xfs_trans_t	*tp;
1130	int		error;
1131
1132	if (!capable(CAP_SYS_ADMIN))
1133		return XFS_ERROR(EPERM);
1134
1135	if (XFS_FORCED_SHUTDOWN(mp))
1136		return XFS_ERROR(EIO);
1137
1138	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
1139	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
1140	if (error) {
1141		xfs_trans_cancel(tp, 0);
1142		return error;
1143	}
1144	xfs_ilock(ip, XFS_ILOCK_EXCL);
1145	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1146
1147	ip->i_d.di_dmevmask = evmask;
1148	ip->i_d.di_dmstate  = state;
1149
1150	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1151	error = xfs_trans_commit(tp, 0);
1152
1153	return error;
1154}
1155
1156/*
1157 * xfs_alloc_file_space()
1158 *      This routine allocates disk space for the given file.
1159 *
1160 *	If alloc_type == 0, this request is for an ALLOCSP type
1161 *	request which will change the file size.  In this case, no
1162 *	DMAPI event will be generated by the call.  A TRUNCATE event
1163 *	will be generated later by xfs_setattr.
1164 *
1165 *	If alloc_type != 0, this request is for a RESVSP type
1166 *	request, and a DMAPI DM_EVENT_WRITE will be generated if the
1167 *	lower block boundary byte address is less than the file's
1168 *	length.
1169 *
1170 * RETURNS:
1171 *       0 on success
1172 *      errno on error
1173 *
1174 */
1175STATIC int
1176xfs_alloc_file_space(
1177	xfs_inode_t		*ip,
1178	xfs_off_t		offset,
1179	xfs_off_t		len,
1180	int			alloc_type,
1181	int			attr_flags)
1182{
1183	xfs_mount_t		*mp = ip->i_mount;
1184	xfs_off_t		count;
1185	xfs_filblks_t		allocated_fsb;
1186	xfs_filblks_t		allocatesize_fsb;
1187	xfs_extlen_t		extsz, temp;
1188	xfs_fileoff_t		startoffset_fsb;
1189	xfs_fsblock_t		firstfsb;
1190	int			nimaps;
1191	int			quota_flag;
1192	int			rt;
1193	xfs_trans_t		*tp;
1194	xfs_bmbt_irec_t		imaps[1], *imapp;
1195	xfs_bmap_free_t		free_list;
1196	uint			qblocks, resblks, resrtextents;
1197	int			committed;
1198	int			error;
1199
1200	trace_xfs_alloc_file_space(ip);
1201
1202	if (XFS_FORCED_SHUTDOWN(mp))
1203		return XFS_ERROR(EIO);
1204
1205	error = xfs_qm_dqattach(ip, 0);
1206	if (error)
1207		return error;
1208
1209	if (len <= 0)
1210		return XFS_ERROR(EINVAL);
1211
1212	rt = XFS_IS_REALTIME_INODE(ip);
1213	extsz = xfs_get_extsz_hint(ip);
1214
1215	count = len;
1216	imapp = &imaps[0];
1217	nimaps = 1;
1218	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
1219	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
1220
1221	/*
1222	 * Allocate file space until done or until there is an error
1223	 */
1224	while (allocatesize_fsb && !error) {
1225		xfs_fileoff_t	s, e;
1226
1227		/*
1228		 * Determine space reservations for data/realtime.
1229		 */
1230		if (unlikely(extsz)) {
1231			s = startoffset_fsb;
1232			do_div(s, extsz);
1233			s *= extsz;
1234			e = startoffset_fsb + allocatesize_fsb;
1235			if ((temp = do_mod(startoffset_fsb, extsz)))
1236				e += temp;
1237			if ((temp = do_mod(e, extsz)))
1238				e += extsz - temp;
1239		} else {
1240			s = 0;
1241			e = allocatesize_fsb;
1242		}
1243
1244		/*
1245		 * The transaction reservation is limited to a 32-bit block
1246		 * count, hence we need to limit the number of blocks we are
1247		 * trying to reserve to avoid an overflow. We can't allocate
1248		 * more than @nimaps extents, and an extent is limited on disk
1249		 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
1250		 */
1251		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
1252		if (unlikely(rt)) {
1253			resrtextents = qblocks = resblks;
1254			resrtextents /= mp->m_sb.sb_rextsize;
1255			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1256			quota_flag = XFS_QMOPT_RES_RTBLKS;
1257		} else {
1258			resrtextents = 0;
1259			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
1260			quota_flag = XFS_QMOPT_RES_REGBLKS;
1261		}
1262
1263		/*
1264		 * Allocate and setup the transaction.
1265		 */
1266		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1267		error = xfs_trans_reserve(tp, resblks,
1268					  XFS_WRITE_LOG_RES(mp), resrtextents,
1269					  XFS_TRANS_PERM_LOG_RES,
1270					  XFS_WRITE_LOG_COUNT);
1271		/*
1272		 * Check for running out of space
1273		 */
1274		if (error) {
1275			/*
1276			 * Free the transaction structure.
1277			 */
1278			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1279			xfs_trans_cancel(tp, 0);
1280			break;
1281		}
1282		xfs_ilock(ip, XFS_ILOCK_EXCL);
1283		error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
1284						      0, quota_flag);
1285		if (error)
1286			goto error1;
1287
1288		xfs_trans_ijoin(tp, ip, 0);
1289
1290		xfs_bmap_init(&free_list, &firstfsb);
1291		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1292					allocatesize_fsb, alloc_type, &firstfsb,
1293					0, imapp, &nimaps, &free_list);
1294		if (error) {
1295			goto error0;
1296		}
1297
1298		/*
1299		 * Complete the transaction
1300		 */
1301		error = xfs_bmap_finish(&tp, &free_list, &committed);
1302		if (error) {
1303			goto error0;
1304		}
1305
1306		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1307		xfs_iunlock(ip, XFS_ILOCK_EXCL);
1308		if (error) {
1309			break;
1310		}
1311
1312		allocated_fsb = imapp->br_blockcount;
1313
1314		if (nimaps == 0) {
1315			error = XFS_ERROR(ENOSPC);
1316			break;
1317		}
1318
1319		startoffset_fsb += allocated_fsb;
1320		allocatesize_fsb -= allocated_fsb;
1321	}
1322
1323	return error;
1324
1325error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1326	xfs_bmap_cancel(&free_list);
1327	xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
1328
1329error1:	/* Just cancel transaction */
1330	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1331	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1332	return error;
1333}
1334
1335/*
1336 * Zero file bytes between startoff and endoff inclusive.
1337 * The iolock is held exclusive and no blocks are buffered.
1338 *
1339 * This function is used by xfs_free_file_space() to zero
1340 * partial blocks when the range to free is not block aligned.
1341 * When unreserving space with boundaries that are not block
1342 * aligned we round up the start and round down the end
1343 * boundaries and then use this function to zero the parts of
1344 * the blocks that got dropped during the rounding.
1345 */
1346STATIC int
1347xfs_zero_remaining_bytes(
1348	xfs_inode_t		*ip,
1349	xfs_off_t		startoff,
1350	xfs_off_t		endoff)
1351{
1352	xfs_bmbt_irec_t		imap;
1353	xfs_fileoff_t		offset_fsb;
1354	xfs_off_t		lastoffset;
1355	xfs_off_t		offset;
1356	xfs_buf_t		*bp;
1357	xfs_mount_t		*mp = ip->i_mount;
1358	int			nimap;
1359	int			error = 0;
1360
1361	/*
1362	 * Avoid doing I/O beyond eof - it's not necessary
1363	 * since nothing can read beyond eof.  The space will
1364	 * be zeroed when the file is extended anyway.
1365	 */
1366	if (startoff >= XFS_ISIZE(ip))
1367		return 0;
1368
1369	if (endoff > XFS_ISIZE(ip))
1370		endoff = XFS_ISIZE(ip);
1371
1372	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
1373					mp->m_rtdev_targp : mp->m_ddev_targp,
1374				  BTOBB(mp->m_sb.sb_blocksize), 0);
1375	if (!bp)
1376		return XFS_ERROR(ENOMEM);
1377
1378	xfs_buf_unlock(bp);
1379
1380	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1381		offset_fsb = XFS_B_TO_FSBT(mp, offset);
1382		nimap = 1;
1383		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
1384		if (error || nimap < 1)
1385			break;
1386		ASSERT(imap.br_blockcount >= 1);
1387		ASSERT(imap.br_startoff == offset_fsb);
1388		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
1389		if (lastoffset > endoff)
1390			lastoffset = endoff;
1391		if (imap.br_startblock == HOLESTARTBLOCK)
1392			continue;
1393		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1394		if (imap.br_state == XFS_EXT_UNWRITTEN)
1395			continue;
1396		XFS_BUF_UNDONE(bp);
1397		XFS_BUF_UNWRITE(bp);
1398		XFS_BUF_READ(bp);
1399		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
1400		xfsbdstrat(mp, bp);
1401		error = xfs_buf_iowait(bp);
1402		if (error) {
1403			xfs_buf_ioerror_alert(bp,
1404					"xfs_zero_remaining_bytes(read)");
1405			break;
1406		}
1407		memset(bp->b_addr +
1408			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
1409		      0, lastoffset - offset + 1);
1410		XFS_BUF_UNDONE(bp);
1411		XFS_BUF_UNREAD(bp);
1412		XFS_BUF_WRITE(bp);
1413		xfsbdstrat(mp, bp);
1414		error = xfs_buf_iowait(bp);
1415		if (error) {
1416			xfs_buf_ioerror_alert(bp,
1417					"xfs_zero_remaining_bytes(write)");
1418			break;
1419		}
1420	}
1421	xfs_buf_free(bp);
1422	return error;
1423}
1424
1425/*
1426 * xfs_free_file_space()
1427 *      This routine frees disk space for the given file.
1428 *
1429 *	This routine is only called by xfs_change_file_space
1430 *	for an UNRESVSP type call.
1431 *
1432 * RETURNS:
1433 *       0 on success
1434 *      errno on error
1435 *
1436 */
1437STATIC int
1438xfs_free_file_space(
1439	xfs_inode_t		*ip,
1440	xfs_off_t		offset,
1441	xfs_off_t		len,
1442	int			attr_flags)
1443{
1444	int			committed;
1445	int			done;
1446	xfs_fileoff_t		endoffset_fsb;
1447	int			error;
1448	xfs_fsblock_t		firstfsb;
1449	xfs_bmap_free_t		free_list;
1450	xfs_bmbt_irec_t		imap;
1451	xfs_off_t		ioffset;
1452	xfs_extlen_t		mod=0;
1453	xfs_mount_t		*mp;
1454	int			nimap;
1455	uint			resblks;
1456	uint			rounding;
1457	int			rt;
1458	xfs_fileoff_t		startoffset_fsb;
1459	xfs_trans_t		*tp;
1460	int			need_iolock = 1;
1461
1462	mp = ip->i_mount;
1463
1464	trace_xfs_free_file_space(ip);
1465
1466	error = xfs_qm_dqattach(ip, 0);
1467	if (error)
1468		return error;
1469
1470	error = 0;
1471	if (len <= 0)	/* if nothing being freed */
1472		return error;
1473	rt = XFS_IS_REALTIME_INODE(ip);
1474	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
1475	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1476
1477	if (attr_flags & XFS_ATTR_NOLOCK)
1478		need_iolock = 0;
1479	if (need_iolock) {
1480		xfs_ilock(ip, XFS_IOLOCK_EXCL);
1481		/* wait for the completion of any pending DIOs */
1482		inode_dio_wait(VFS_I(ip));
1483	}
1484
1485	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1486	ioffset = offset & ~(rounding - 1);
1487	error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1488					      ioffset, -1);
1489	if (error)
1490		goto out_unlock_iolock;
1491	truncate_pagecache_range(VFS_I(ip), ioffset, -1);
1492
1493	/*
1494	 * Need to zero the stuff we're not freeing, on disk.
1495	 * If it's a realtime file & can't use unwritten extents then we
1496	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
1497	 * will take care of it for us.
1498	 */
1499	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
1500		nimap = 1;
1501		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
1502					&imap, &nimap, 0);
1503		if (error)
1504			goto out_unlock_iolock;
1505		ASSERT(nimap == 0 || nimap == 1);
1506		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1507			xfs_daddr_t	block;
1508
1509			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1510			block = imap.br_startblock;
1511			mod = do_div(block, mp->m_sb.sb_rextsize);
1512			if (mod)
1513				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1514		}
1515		nimap = 1;
1516		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
1517					&imap, &nimap, 0);
1518		if (error)
1519			goto out_unlock_iolock;
1520		ASSERT(nimap == 0 || nimap == 1);
1521		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1522			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1523			mod++;
1524			if (mod && (mod != mp->m_sb.sb_rextsize))
1525				endoffset_fsb -= mod;
1526		}
1527	}
1528	if ((done = (endoffset_fsb <= startoffset_fsb)))
1529		/*
1530		 * One contiguous piece to clear
1531		 */
1532		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
1533	else {
1534		/*
1535		 * Some full blocks, possibly two pieces to clear
1536		 */
1537		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
1538			error = xfs_zero_remaining_bytes(ip, offset,
1539				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
1540		if (!error &&
1541		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
1542			error = xfs_zero_remaining_bytes(ip,
1543				XFS_FSB_TO_B(mp, endoffset_fsb),
1544				offset + len - 1);
1545	}
1546
1547	/*
1548	 * free file space until done or until there is an error
1549	 */
1550	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1551	while (!error && !done) {
1552
1553		/*
1554		 * allocate and setup the transaction. Allow this
1555		 * transaction to dip into the reserve blocks to ensure
1556		 * the freeing of the space succeeds at ENOSPC.
1557		 */
1558		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1559		tp->t_flags |= XFS_TRANS_RESERVE;
1560		error = xfs_trans_reserve(tp,
1561					  resblks,
1562					  XFS_WRITE_LOG_RES(mp),
1563					  0,
1564					  XFS_TRANS_PERM_LOG_RES,
1565					  XFS_WRITE_LOG_COUNT);
1566
1567		/*
1568		 * check for running out of space
1569		 */
1570		if (error) {
1571			/*
1572			 * Free the transaction structure.
1573			 */
1574			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1575			xfs_trans_cancel(tp, 0);
1576			break;
1577		}
1578		xfs_ilock(ip, XFS_ILOCK_EXCL);
1579		error = xfs_trans_reserve_quota(tp, mp,
1580				ip->i_udquot, ip->i_gdquot,
1581				resblks, 0, XFS_QMOPT_RES_REGBLKS);
1582		if (error)
1583			goto error1;
1584
1585		xfs_trans_ijoin(tp, ip, 0);
1586
1587		/*
1588		 * issue the bunmapi() call to free the blocks
1589		 */
1590		xfs_bmap_init(&free_list, &firstfsb);
1591		error = xfs_bunmapi(tp, ip, startoffset_fsb,
1592				  endoffset_fsb - startoffset_fsb,
1593				  0, 2, &firstfsb, &free_list, &done);
1594		if (error) {
1595			goto error0;
1596		}
1597
1598		/*
1599		 * complete the transaction
1600		 */
1601		error = xfs_bmap_finish(&tp, &free_list, &committed);
1602		if (error) {
1603			goto error0;
1604		}
1605
1606		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1607		xfs_iunlock(ip, XFS_ILOCK_EXCL);
1608	}
1609
1610 out_unlock_iolock:
1611	if (need_iolock)
1612		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1613	return error;
1614
1615 error0:
1616	xfs_bmap_cancel(&free_list);
1617 error1:
1618	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1619	xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
1620		    XFS_ILOCK_EXCL);
1621	return error;
1622}
1623
1624
1625STATIC int
1626xfs_zero_file_space(
1627	struct xfs_inode	*ip,
1628	xfs_off_t		offset,
1629	xfs_off_t		len,
1630	int			attr_flags)
1631{
1632	struct xfs_mount	*mp = ip->i_mount;
1633	uint			granularity;
1634	xfs_off_t		start_boundary;
1635	xfs_off_t		end_boundary;
1636	int			error;
1637
1638	granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1639
1640	/*
1641	 * Round the range of extents we are going to convert inwards.  If the
1642	 * offset is aligned, then it doesn't get changed so we zero from the
1643	 * start of the block offset points to.
1644	 */
1645	start_boundary = round_up(offset, granularity);
1646	end_boundary = round_down(offset + len, granularity);
1647
1648	ASSERT(start_boundary >= offset);
1649	ASSERT(end_boundary <= offset + len);
1650
1651	if (!(attr_flags & XFS_ATTR_NOLOCK))
1652		xfs_ilock(ip, XFS_IOLOCK_EXCL);
1653
1654	if (start_boundary < end_boundary - 1) {
1655		/* punch out the page cache over the conversion range */
1656		truncate_pagecache_range(VFS_I(ip), start_boundary,
1657					 end_boundary - 1);
1658		/* convert the blocks */
1659		error = xfs_alloc_file_space(ip, start_boundary,
1660					end_boundary - start_boundary - 1,
1661					XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
1662					attr_flags);
1663		if (error)
1664			goto out_unlock;
1665
1666		/* We've handled the interior of the range, now for the edges */
1667		if (start_boundary != offset)
1668			error = xfs_iozero(ip, offset, start_boundary - offset);
1669		if (error)
1670			goto out_unlock;
1671
1672		if (end_boundary != offset + len)
1673			error = xfs_iozero(ip, end_boundary,
1674					   offset + len - end_boundary);
1675
1676	} else {
1677		/*
1678		 * It's either a sub-granularity range or the range spanned lies
1679		 * partially across two adjacent blocks.
1680		 */
1681		error = xfs_iozero(ip, offset, len);
1682	}
1683
1684out_unlock:
1685	if (!(attr_flags & XFS_ATTR_NOLOCK))
1686		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1687	return error;
1688
1689}
1690
1691/*
1692 * xfs_change_file_space()
1693 *      This routine allocates or frees disk space for the given file.
1694 *      The user specified parameters are checked for alignment and size
1695 *      limitations.
1696 *
1697 * RETURNS:
1698 *       0 on success
1699 *      errno on error
1700 *
1701 */
1702int
1703xfs_change_file_space(
1704	xfs_inode_t	*ip,
1705	int		cmd,
1706	xfs_flock64_t	*bf,
1707	xfs_off_t	offset,
1708	int		attr_flags)
1709{
1710	xfs_mount_t	*mp = ip->i_mount;
1711	int		clrprealloc;
1712	int		error;
1713	xfs_fsize_t	fsize;
1714	int		setprealloc;
1715	xfs_off_t	startoffset;
1716	xfs_trans_t	*tp;
1717	struct iattr	iattr;
1718
1719	if (!S_ISREG(ip->i_d.di_mode))
1720		return XFS_ERROR(EINVAL);
1721
1722	switch (bf->l_whence) {
1723	case 0: /*SEEK_SET*/
1724		break;
1725	case 1: /*SEEK_CUR*/
1726		bf->l_start += offset;
1727		break;
1728	case 2: /*SEEK_END*/
1729		bf->l_start += XFS_ISIZE(ip);
1730		break;
1731	default:
1732		return XFS_ERROR(EINVAL);
1733	}
1734
1735	/*
1736	 * length of <= 0 for resv/unresv/zero is invalid.  length for
1737	 * alloc/free is ignored completely and we have no idea what userspace
1738	 * might have set it to, so set it to zero to allow range
1739	 * checks to pass.
1740	 */
1741	switch (cmd) {
1742	case XFS_IOC_ZERO_RANGE:
1743	case XFS_IOC_RESVSP:
1744	case XFS_IOC_RESVSP64:
1745	case XFS_IOC_UNRESVSP:
1746	case XFS_IOC_UNRESVSP64:
1747		if (bf->l_len <= 0)
1748			return XFS_ERROR(EINVAL);
1749		break;
1750	default:
1751		bf->l_len = 0;
1752		break;
1753	}
1754
1755	if (bf->l_start < 0 ||
1756	    bf->l_start > mp->m_super->s_maxbytes ||
1757	    bf->l_start + bf->l_len < 0 ||
1758	    bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
1759		return XFS_ERROR(EINVAL);
1760
1761	bf->l_whence = 0;
1762
1763	startoffset = bf->l_start;
1764	fsize = XFS_ISIZE(ip);
1765
1766	setprealloc = clrprealloc = 0;
1767	switch (cmd) {
1768	case XFS_IOC_ZERO_RANGE:
1769		error = xfs_zero_file_space(ip, startoffset, bf->l_len,
1770						attr_flags);
1771		if (error)
1772			return error;
1773		setprealloc = 1;
1774		break;
1775
1776	case XFS_IOC_RESVSP:
1777	case XFS_IOC_RESVSP64:
1778		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
1779						XFS_BMAPI_PREALLOC, attr_flags);
1780		if (error)
1781			return error;
1782		setprealloc = 1;
1783		break;
1784
1785	case XFS_IOC_UNRESVSP:
1786	case XFS_IOC_UNRESVSP64:
1787		if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
1788								attr_flags)))
1789			return error;
1790		break;
1791
1792	case XFS_IOC_ALLOCSP:
1793	case XFS_IOC_ALLOCSP64:
1794	case XFS_IOC_FREESP:
1795	case XFS_IOC_FREESP64:
1796		/*
1797		 * These operations actually do IO when extending the file, but
1798		 * the allocation is done seperately to the zeroing that is
1799		 * done. This set of operations need to be serialised against
1800		 * other IO operations, such as truncate and buffered IO. We
1801		 * need to take the IOLOCK here to serialise the allocation and
1802		 * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
1803		 * truncate, direct IO) from racing against the transient
1804		 * allocated but not written state we can have here.
1805		 */
1806		xfs_ilock(ip, XFS_IOLOCK_EXCL);
1807		if (startoffset > fsize) {
1808			error = xfs_alloc_file_space(ip, fsize,
1809					startoffset - fsize, 0,
1810					attr_flags | XFS_ATTR_NOLOCK);
1811			if (error) {
1812				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1813				break;
1814			}
1815		}
1816
1817		iattr.ia_valid = ATTR_SIZE;
1818		iattr.ia_size = startoffset;
1819
1820		error = xfs_setattr_size(ip, &iattr,
1821					 attr_flags | XFS_ATTR_NOLOCK);
1822		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1823
1824		if (error)
1825			return error;
1826
1827		clrprealloc = 1;
1828		break;
1829
1830	default:
1831		ASSERT(0);
1832		return XFS_ERROR(EINVAL);
1833	}
1834
1835	/*
1836	 * update the inode timestamp, mode, and prealloc flag bits
1837	 */
1838	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
1839
1840	if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
1841				      0, 0, 0))) {
1842		/* ASSERT(0); */
1843		xfs_trans_cancel(tp, 0);
1844		return error;
1845	}
1846
1847	xfs_ilock(ip, XFS_ILOCK_EXCL);
1848	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1849
1850	if ((attr_flags & XFS_ATTR_DMI) == 0) {
1851		ip->i_d.di_mode &= ~S_ISUID;
1852
1853		/*
1854		 * Note that we don't have to worry about mandatory
1855		 * file locking being disabled here because we only
1856		 * clear the S_ISGID bit if the Group execute bit is
1857		 * on, but if it was on then mandatory locking wouldn't
1858		 * have been enabled.
1859		 */
1860		if (ip->i_d.di_mode & S_IXGRP)
1861			ip->i_d.di_mode &= ~S_ISGID;
1862
1863		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1864	}
1865	if (setprealloc)
1866		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
1867	else if (clrprealloc)
1868		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
1869
1870	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1871	if (attr_flags & XFS_ATTR_SYNC)
1872		xfs_trans_set_sync(tp);
1873	return xfs_trans_commit(tp, 0);
1874}