
/fs/xfs/xfs_bmap_util.c

http://github.com/torvalds/linux
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * Copyright (c) 2012 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"

/* Kernel only BMAP related definitions and functions */

/*
 * Convert the given file system block to a disk block. We have to treat it
 * differently based on whether the file is a real time file or not, because the
 * bmap code does.
 */
xfs_daddr_t
xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
{
	if (XFS_IS_REALTIME_INODE(ip))
		return XFS_FSB_TO_BB(ip->i_mount, fsb);
	return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
}
/*
 * Routine to zero an extent on disk allocated to the specific inode.
 *
 * The VFS functions take a linearised filesystem block offset, so we have to
 * convert the sparse xfs fsb to the right format first.
 * VFS types are real funky, too.
 */
int
xfs_zero_extent(
	struct xfs_inode	*ip,
	xfs_fsblock_t		start_fsb,
	xfs_off_t		count_fsb)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
	xfs_daddr_t		sector = xfs_fsb_to_db(ip, start_fsb);
	sector_t		block = XFS_BB_TO_FSBT(mp, sector);
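
	/*
	 * blkdev_issue_zeroout() works in 512-byte sector units, so shift
	 * the block counts up by (blocksize_bits - 9), i.e.
	 * log2(blocksize) - log2(512).
	 */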
	return blkdev_issue_zeroout(target->bt_bdev,
		block << (mp->m_super->s_blocksize_bits - 9),
		count_fsb << (mp->m_super->s_blocksize_bits - 9),
		GFP_NOFS, 0);
}
#ifdef CONFIG_XFS_RT
int
xfs_bmap_rtalloc(
	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
{
	int			error;	/* error return value */
	xfs_mount_t		*mp;	/* mount point structure */
	xfs_extlen_t		prod = 0; /* product factor for allocators */
	xfs_extlen_t		mod = 0;  /* alignment remainder */
	xfs_extlen_t		ralen = 0; /* realtime allocation length */
	xfs_extlen_t		align;	/* minimum allocation alignment */
	xfs_rtblock_t		rtb;

	mp = ap->ip->i_mount;
	align = xfs_get_extsz_hint(ap->ip);
	prod = align / mp->m_sb.sb_rextsize;
	error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
					align, 1, ap->eof, 0,
					ap->conv, &ap->offset, &ap->length);
	if (error)
		return error;
	ASSERT(ap->length);
	ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);

	/*
	 * If the offset & length are not perfectly aligned
	 * then kill prod, it will just get us in trouble.
	 */
	div_u64_rem(ap->offset, align, &mod);
	if (mod || ap->length % align)
		prod = 1;
	/*
	 * Set ralen to be the actual requested length in rtextents.
	 */
	ralen = ap->length / mp->m_sb.sb_rextsize;
	/*
	 * If the old value was close enough to MAXEXTLEN that
	 * we rounded up to it, cut it back so it's valid again.
	 * Note that if it's a really large request (bigger than
	 * MAXEXTLEN), we don't hear about that number, and can't
	 * adjust the starting point to match it.
	 */
	if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
		ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;

	/*
	 * Lock out modifications to both the RT bitmap and summary inodes
	 */
	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
	xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
	xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
	xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);

	/*
	 * If it's an allocation to an empty file at offset 0,
	 * pick an extent that will space things out in the rt area.
	 */
	if (ap->eof && ap->offset == 0) {
		xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */

		error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
		if (error)
			return error;
		ap->blkno = rtx * mp->m_sb.sb_rextsize;
	} else {
		ap->blkno = 0;
	}

	xfs_bmap_adjacent(ap);

	/*
	 * Realtime allocation, done through xfs_rtallocate_extent.
	 */
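	/*
	 * ap->blkno is still in filesystem blocks here;
	 * xfs_rtallocate_extent() works in realtime extent units, so convert
	 * the target down before the call and scale the result back up after.
	 */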
	do_div(ap->blkno, mp->m_sb.sb_rextsize);
	rtb = ap->blkno;
	ap->length = ralen;
	error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
				&ralen, ap->wasdel, prod, &rtb);
	if (error)
		return error;

	ap->blkno = rtb;
	if (ap->blkno != NULLFSBLOCK) {
		ap->blkno *= mp->m_sb.sb_rextsize;
		ralen *= mp->m_sb.sb_rextsize;
		ap->length = ralen;
		ap->ip->i_d.di_nblocks += ralen;
		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
		if (ap->wasdel)
			ap->ip->i_delayed_blks -= ralen;
		/*
		 * Adjust the disk quota also. This was reserved
		 * earlier.
		 */
		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
	} else {
		ap->length = 0;
	}
	return 0;
}
#endif /* CONFIG_XFS_RT */
/*
 * Extent tree block counting routines.
 */

/*
 * Count leaf blocks given a range of extent records. Delayed allocation
 * extents are not counted towards the totals.
 */
xfs_extnum_t
xfs_bmap_count_leaves(
	struct xfs_ifork	*ifp,
	xfs_filblks_t		*count)
{
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;
	xfs_extnum_t		numrecs = 0;

	for_each_xfs_iext(ifp, &icur, &got) {
		if (!isnullstartblock(got.br_startblock)) {
			*count += got.br_blockcount;
			numrecs++;
		}
	}

	return numrecs;
}
/*
 * Count fsblocks of the given fork. Delayed allocation extents are
 * not counted towards the totals.
 */
int
xfs_bmap_count_blocks(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_extnum_t		*nextents,
	xfs_filblks_t		*count)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
	struct xfs_btree_cur	*cur;
	xfs_extlen_t		btblocks = 0;
	int			error;

	*nextents = 0;
	*count = 0;

	if (!ifp)
		return 0;

	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
	case XFS_DINODE_FMT_BTREE:
		if (!(ifp->if_flags & XFS_IFEXTENTS)) {
			error = xfs_iread_extents(tp, ip, whichfork);
			if (error)
				return error;
		}

		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
		error = xfs_btree_count_blocks(cur, &btblocks);
		xfs_btree_del_cursor(cur, error);
		if (error)
			return error;

		/*
		 * xfs_btree_count_blocks includes the root block contained in
		 * the inode fork in @btblocks, so subtract one because we're
		 * only interested in allocated disk blocks.
		 */
		*count += btblocks - 1;

		/* fall through */
	case XFS_DINODE_FMT_EXTENTS:
		*nextents = xfs_bmap_count_leaves(ifp, count);
		break;
	}

	return 0;
}
static int
xfs_getbmap_report_one(
	struct xfs_inode	*ip,
	struct getbmapx		*bmv,
	struct kgetbmap		*out,
	int64_t			bmv_end,
	struct xfs_bmbt_irec	*got)
{
	struct kgetbmap		*p = out + bmv->bmv_entries;
	bool			shared = false;
	int			error;

	error = xfs_reflink_trim_around_shared(ip, got, &shared);
	if (error)
		return error;

	if (isnullstartblock(got->br_startblock) ||
	    got->br_startblock == DELAYSTARTBLOCK) {
		/*
		 * Delalloc extents that start beyond EOF can occur due to
		 * speculative EOF allocation when the delalloc extent is larger
		 * than the largest freespace extent at conversion time. These
		 * extents cannot be converted by data writeback, so can exist
		 * here even if we are not supposed to be finding delalloc
		 * extents.
		 */
		if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
			ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);

		p->bmv_oflags |= BMV_OF_DELALLOC;
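		/*
		 * -2 is the getbmap convention for delalloc ranges, which
		 * have no disk address yet (holes are reported as -1).
		 */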
		p->bmv_block = -2;
	} else {
		p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
	}

	if (got->br_state == XFS_EXT_UNWRITTEN &&
	    (bmv->bmv_iflags & BMV_IF_PREALLOC))
		p->bmv_oflags |= BMV_OF_PREALLOC;

	if (shared)
		p->bmv_oflags |= BMV_OF_SHARED;

	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);

	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
	bmv->bmv_entries++;
	return 0;
}

static void
xfs_getbmap_report_hole(
	struct xfs_inode	*ip,
	struct getbmapx		*bmv,
	struct kgetbmap		*out,
	int64_t			bmv_end,
	xfs_fileoff_t		bno,
	xfs_fileoff_t		end)
{
	struct kgetbmap		*p = out + bmv->bmv_entries;

	if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
		return;

	p->bmv_block = -1;
	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);

	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
	bmv->bmv_entries++;
}
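
/*
 * The output is full when every slot has been used or the requested range
 * has been exhausted. bmv_count from userspace counts the header record as
 * well, hence the reserved slot (bmv_count - 1).
 */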
static inline bool
xfs_getbmap_full(
	struct getbmapx		*bmv)
{
	return bmv->bmv_length == 0 || bmv->bmv_entries >= bmv->bmv_count - 1;
}
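
/*
 * Move @rec past the portion that was just reported: trim the front of the
 * record so it covers the remainder up to @total_end, returning false once
 * the whole record has been consumed.
 */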
static bool
xfs_getbmap_next_rec(
	struct xfs_bmbt_irec	*rec,
	xfs_fileoff_t		total_end)
{
	xfs_fileoff_t		end = rec->br_startoff + rec->br_blockcount;

	if (end == total_end)
		return false;

	rec->br_startoff += rec->br_blockcount;
	if (!isnullstartblock(rec->br_startblock) &&
	    rec->br_startblock != DELAYSTARTBLOCK)
		rec->br_startblock += rec->br_blockcount;
	rec->br_blockcount = total_end - end;
	return true;
}
/*
 * Get inode's extents as described in bmv, and format for output.
 * Calls formatter to fill the user's buffer until all extents
 * are mapped, until the passed-in bmv->bmv_count slots have
 * been filled, or until the formatter short-circuits the loop,
 * if it is tracking filled-in extents on its own.
 */
int						/* error code */
xfs_getbmap(
	struct xfs_inode	*ip,
	struct getbmapx		*bmv,		/* user bmap structure */
	struct kgetbmap		*out)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			iflags = bmv->bmv_iflags;
	int			whichfork, lock, error = 0;
	int64_t			bmv_end, max_len;
	xfs_fileoff_t		bno, first_bno;
	struct xfs_ifork	*ifp;
	struct xfs_bmbt_irec	got, rec;
	xfs_filblks_t		len;
	struct xfs_iext_cursor	icur;

	if (bmv->bmv_iflags & ~BMV_IF_VALID)
		return -EINVAL;
#ifndef DEBUG
	/* Only allow CoW fork queries if we're debugging. */
	if (iflags & BMV_IF_COWFORK)
		return -EINVAL;
#endif
	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
		return -EINVAL;

	if (bmv->bmv_length < -1)
		return -EINVAL;
	bmv->bmv_entries = 0;
	if (bmv->bmv_length == 0)
		return 0;

	if (iflags & BMV_IF_ATTRFORK)
		whichfork = XFS_ATTR_FORK;
	else if (iflags & BMV_IF_COWFORK)
		whichfork = XFS_COW_FORK;
	else
		whichfork = XFS_DATA_FORK;
	ifp = XFS_IFORK_PTR(ip, whichfork);

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	switch (whichfork) {
	case XFS_ATTR_FORK:
		if (!XFS_IFORK_Q(ip))
			goto out_unlock_iolock;

		max_len = 1LL << 32;
		lock = xfs_ilock_attr_map_shared(ip);
		break;
	case XFS_COW_FORK:
		/* No CoW fork? Just return */
		if (!ifp)
			goto out_unlock_iolock;

		if (xfs_get_cowextsz_hint(ip))
			max_len = mp->m_super->s_maxbytes;
		else
			max_len = XFS_ISIZE(ip);

		lock = XFS_ILOCK_SHARED;
		xfs_ilock(ip, lock);
		break;
	case XFS_DATA_FORK:
		if (!(iflags & BMV_IF_DELALLOC) &&
		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
			if (error)
				goto out_unlock_iolock;

			/*
			 * Even after flushing the inode, there can still be
			 * delalloc blocks on the inode beyond EOF due to
			 * speculative preallocation. These are not removed
			 * until the release function is called or the inode
			 * is inactivated. Hence we cannot assert here that
			 * ip->i_delayed_blks == 0.
			 */
		}

		if (xfs_get_extsz_hint(ip) ||
		    (ip->i_d.di_flags &
		     (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
			max_len = mp->m_super->s_maxbytes;
		else
			max_len = XFS_ISIZE(ip);

		lock = xfs_ilock_data_map_shared(ip);
		break;
	}

	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
	case XFS_DINODE_FMT_EXTENTS:
	case XFS_DINODE_FMT_BTREE:
		break;
	case XFS_DINODE_FMT_LOCAL:
		/* Local format inode forks report no extents. */
		goto out_unlock_ilock;
	default:
		error = -EINVAL;
		goto out_unlock_ilock;
	}
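
	/*
	 * A bmv_length of -1 means "map everything from bmv_offset to the
	 * end of the file", clamped to max_len for this fork.
	 */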
	if (bmv->bmv_length == -1) {
		max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
		bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
	}

	bmv_end = bmv->bmv_offset + bmv->bmv_length;

	first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
	len = XFS_BB_TO_FSB(mp, bmv->bmv_length);

	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
		error = xfs_iread_extents(NULL, ip, whichfork);
		if (error)
			goto out_unlock_ilock;
	}

	if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
		/*
		 * Report a whole-file hole if the delalloc flag is set to
		 * stay compatible with the old implementation.
		 */
		if (iflags & BMV_IF_DELALLOC)
			xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
					XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
		goto out_unlock_ilock;
	}

	while (!xfs_getbmap_full(bmv)) {
		xfs_trim_extent(&got, first_bno, len);

		/*
		 * Report an entry for a hole if this extent doesn't directly
		 * follow the previous one.
		 */
		if (got.br_startoff > bno) {
			xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
					got.br_startoff);
			if (xfs_getbmap_full(bmv))
				break;
		}

		/*
		 * In order to report shared extents accurately, we report each
		 * distinct shared / unshared part of a single bmbt record with
		 * an individual getbmapx record.
		 */
		bno = got.br_startoff + got.br_blockcount;
		rec = got;
		do {
			error = xfs_getbmap_report_one(ip, bmv, out, bmv_end,
					&rec);
			if (error || xfs_getbmap_full(bmv))
				goto out_unlock_ilock;
		} while (xfs_getbmap_next_rec(&rec, bno));

		if (!xfs_iext_next_extent(ifp, &icur, &got)) {
			xfs_fileoff_t	end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));

			out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST;

			if (whichfork != XFS_ATTR_FORK && bno < end &&
			    !xfs_getbmap_full(bmv)) {
				xfs_getbmap_report_hole(ip, bmv, out, bmv_end,
						bno, end);
			}
			break;
		}

		if (bno >= first_bno + len)
			break;
	}

out_unlock_ilock:
	xfs_iunlock(ip, lock);
out_unlock_iolock:
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return error;
}
/*
 * Dead simple method of punching delayed allocation blocks from a range in
 * the inode. This will always punch out both the start and end blocks, even
 * if the ranges only partially overlap them, so it is up to the caller to
 * ensure that partial blocks are not passed in.
 */
int
xfs_bmap_punch_delalloc_range(
	struct xfs_inode	*ip,
	xfs_fileoff_t		start_fsb,
	xfs_fileoff_t		length)
{
	struct xfs_ifork	*ifp = &ip->i_df;
	xfs_fileoff_t		end_fsb = start_fsb + length;
	struct xfs_bmbt_irec	got, del;
	struct xfs_iext_cursor	icur;
	int			error = 0;

	ASSERT(ifp->if_flags & XFS_IFEXTENTS);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
		goto out_unlock;

	while (got.br_startoff + got.br_blockcount > start_fsb) {
		del = got;
		xfs_trim_extent(&del, start_fsb, length);

		/*
		 * A delete can push the cursor forward. Step back to the
		 * previous extent on non-delalloc or extents outside the
		 * target range.
		 */
		if (!del.br_blockcount ||
		    !isnullstartblock(del.br_startblock)) {
			if (!xfs_iext_prev_extent(ifp, &icur, &got))
				break;
			continue;
		}

		error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur,
						  &got, &del);
		if (error || !xfs_iext_get_extent(ifp, &icur, &got))
			break;
	}

out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
/*
 * Test whether it is appropriate to check an inode for and free post EOF
 * blocks. The 'force' parameter determines whether we should also consider
 * regular files that are marked preallocated or append-only.
 */
bool
xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
{
	/* prealloc/delalloc exists only on regular files */
	if (!S_ISREG(VFS_I(ip)->i_mode))
		return false;

	/*
	 * Zero sized files with no cached pages and no delalloc blocks will
	 * not have speculative prealloc/delalloc blocks to remove.
	 */
	if (VFS_I(ip)->i_size == 0 &&
	    VFS_I(ip)->i_mapping->nrpages == 0 &&
	    ip->i_delayed_blks == 0)
		return false;

	/* If we haven't read in the extent list, then don't do it now. */
	if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
		return false;

	/*
	 * Do not free real preallocated or append-only files unless the file
	 * has delalloc blocks and we are forced to remove them.
	 */
	if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
		if (!force || ip->i_delayed_blks == 0)
			return false;

	return true;
}
/*
 * This is called to free any blocks beyond eof. The caller must hold
 * IOLOCK_EXCL unless we are in the inode reclaim path and have the only
 * reference to the inode.
 */
int
xfs_free_eofblocks(
	struct xfs_inode	*ip)
{
	struct xfs_trans	*tp;
	int			error;
	xfs_fileoff_t		end_fsb;
	xfs_fileoff_t		last_fsb;
	xfs_filblks_t		map_len;
	int			nimaps;
	struct xfs_bmbt_irec	imap;
	struct xfs_mount	*mp = ip->i_mount;

	/*
	 * Figure out if there are any blocks beyond the end
	 * of the file. If not, then there is nothing to do.
	 */
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
	last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
	if (last_fsb <= end_fsb)
		return 0;
	map_len = last_fsb - end_fsb;

	nimaps = 1;
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/*
	 * If there are blocks after the end of file, truncate the file to its
	 * current size to free them up.
	 */
	if (!error && (nimaps != 0) &&
	    (imap.br_startblock != HOLESTARTBLOCK ||
	     ip->i_delayed_blks)) {
		/*
		 * Attach the dquots to the inode up front.
		 */
		error = xfs_qm_dqattach(ip);
		if (error)
			return error;

		/* wait on dio to ensure i_size has settled */
		inode_dio_wait(VFS_I(ip));

		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
				&tp);
		if (error) {
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
			return error;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, 0);

		/*
		 * Do not update the on-disk file size. If we update the
		 * on-disk file size and then the system crashes before the
		 * contents of the file are flushed to disk then the files
		 * may be full of holes (i.e. the NULL files bug).
		 */
		error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
					XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
		if (error) {
			/*
			 * If we get an error at this point we simply don't
			 * bother truncating the file.
			 */
			xfs_trans_cancel(tp);
		} else {
			error = xfs_trans_commit(tp);
			if (!error)
				xfs_inode_clear_eofblocks_tag(ip);
		}

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}
	return error;
}
int
xfs_alloc_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len,
	int			alloc_type)
{
	xfs_mount_t		*mp = ip->i_mount;
	xfs_off_t		count;
	xfs_filblks_t		allocated_fsb;
	xfs_filblks_t		allocatesize_fsb;
	xfs_extlen_t		extsz, temp;
	xfs_fileoff_t		startoffset_fsb;
	xfs_fileoff_t		endoffset_fsb;
	int			nimaps;
	int			quota_flag;
	int			rt;
	xfs_trans_t		*tp;
	xfs_bmbt_irec_t		imaps[1], *imapp;
	uint			qblocks, resblks, resrtextents;
	int			error;

	trace_xfs_alloc_file_space(ip);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	if (len <= 0)
		return -EINVAL;

	rt = XFS_IS_REALTIME_INODE(ip);
	extsz = xfs_get_extsz_hint(ip);

	count = len;
	imapp = &imaps[0];
	nimaps = 1;
	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
	endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
	allocatesize_fsb = endoffset_fsb - startoffset_fsb;

	/*
	 * Allocate file space until done or until there is an error
	 */
	while (allocatesize_fsb && !error) {
		xfs_fileoff_t	s, e;

		/*
		 * Determine space reservations for data/realtime.
		 */
		if (unlikely(extsz)) {
			s = startoffset_fsb;
			do_div(s, extsz);
			s *= extsz;
			e = startoffset_fsb + allocatesize_fsb;
			div_u64_rem(startoffset_fsb, extsz, &temp);
			if (temp)
				e += temp;
			div_u64_rem(e, extsz, &temp);
			if (temp)
				e += extsz - temp;
		} else {
			s = 0;
			e = allocatesize_fsb;
		}
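
		/*
		 * [s, e) now covers the request rounded out to extent size
		 * hint granularity; it only sizes the reservation below, the
		 * allocation itself still starts at startoffset_fsb.
		 */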
		/*
		 * The transaction reservation is limited to a 32-bit block
		 * count, hence we need to limit the number of blocks we are
		 * trying to reserve to avoid an overflow. We can't allocate
		 * more than @nimaps extents, and an extent is limited on disk
		 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
		 */
		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
		if (unlikely(rt)) {
			resrtextents = qblocks = resblks;
			resrtextents /= mp->m_sb.sb_rextsize;
			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
			quota_flag = XFS_QMOPT_RES_RTBLKS;
		} else {
			resrtextents = 0;
			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
			quota_flag = XFS_QMOPT_RES_REGBLKS;
		}

		/*
		 * Allocate and set up the transaction.
		 */
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
				resrtextents, 0, &tp);

		/*
		 * Check for running out of space
		 */
		if (error) {
			/*
			 * Free the transaction structure.
			 */
			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			break;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
						      0, quota_flag);
		if (error)
			goto error1;

		xfs_trans_ijoin(tp, ip, 0);

		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
					allocatesize_fsb, alloc_type, 0, imapp,
					&nimaps);
		if (error)
			goto error0;

		/*
		 * Complete the transaction
		 */
		error = xfs_trans_commit(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error)
			break;

		allocated_fsb = imapp->br_blockcount;

		if (nimaps == 0) {
			error = -ENOSPC;
			break;
		}

		startoffset_fsb += allocated_fsb;
		allocatesize_fsb -= allocated_fsb;
	}

	return error;

error0:	/* unlock inode, unreserve quota blocks, cancel trans */
	xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);

error1:	/* Just cancel transaction */
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
static int
xfs_unmap_extent(
	struct xfs_inode	*ip,
	xfs_fileoff_t		startoffset_fsb,
	xfs_filblks_t		len_fsb,
	int			*done)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	uint			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
	if (error) {
		ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
			ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
	if (error)
		goto out_trans_cancel;

	xfs_trans_ijoin(tp, ip, 0);

	error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done);
	if (error)
		goto out_trans_cancel;

	error = xfs_trans_commit(tp);
out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}
/* Caller must first wait for the completion of any pending DIOs if required. */
int
xfs_flush_unmap_range(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	xfs_off_t		rounding, start, end;
	int			error;
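
	/*
	 * Flush and invalidate in units of whole pages or whole filesystem
	 * blocks, whichever is larger, so no partially cached unit survives
	 * across the unmap.
	 */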
	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
	start = round_down(offset, rounding);
	end = round_up(offset + len, rounding) - 1;

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (error)
		return error;
	truncate_pagecache_range(inode, start, end);
	return 0;
}
int
xfs_free_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		startoffset_fsb;
	xfs_fileoff_t		endoffset_fsb;
	int			done = 0, error;

	trace_xfs_free_file_space(ip);

	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	if (len <= 0)	/* if nothing being freed */
		return 0;

	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);

	/*
	 * Need to zero the stuff we're not freeing, on disk.
	 */
	if (endoffset_fsb > startoffset_fsb) {
		while (!done) {
			error = xfs_unmap_extent(ip, startoffset_fsb,
					endoffset_fsb - startoffset_fsb, &done);
			if (error)
				return error;
		}
	}

	/*
	 * Now that we've unmapped all full blocks we'll have to zero out any
	 * partial block at the beginning and/or end. iomap_zero_range is smart
	 * enough to skip any holes, including those we just created, but we
	 * must take care not to zero beyond EOF and enlarge i_size.
	 */
	if (offset >= XFS_ISIZE(ip))
		return 0;
	if (offset + len > XFS_ISIZE(ip))
		len = XFS_ISIZE(ip) - offset;
	error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
			&xfs_buffered_write_iomap_ops);
	if (error)
		return error;

	/*
	 * If we zeroed right up to EOF and EOF straddles a page boundary we
	 * must make sure that the post-EOF area is also zeroed because the
	 * page could be mmap'd and iomap_zero_range doesn't do that for us.
	 * Writeback of the eof page will do this, albeit clumsily.
	 */
	if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
				round_down(offset + len, PAGE_SIZE), LLONG_MAX);
	}

	return error;
}
static int
xfs_prepare_shift(
	struct xfs_inode	*ip,
	loff_t			offset)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
	 * into the accessible region of the file.
	 */
	if (xfs_can_free_eofblocks(ip, true)) {
		error = xfs_free_eofblocks(ip);
		if (error)
			return error;
	}

	/*
	 * Shift operations must stabilize the start block offset boundary along
	 * with the full range of the operation. If we don't, a COW writeback
	 * completion could race with an insert, front merge with the start
	 * extent (after split) during the shift and corrupt the file. Start
	 * with the block just prior to the start to stabilize the boundary.
	 */
	offset = round_down(offset, 1 << mp->m_sb.sb_blocklog);
	if (offset)
		offset -= (1 << mp->m_sb.sb_blocklog);

	/*
	 * Writeback and invalidate cache for the remainder of the file as we're
	 * about to shift down every extent from offset to EOF.
	 */
	error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
	if (error)
		return error;

	/*
	 * Clean out anything hanging around in the cow fork now that
	 * we've flushed all the dirty data out to disk to avoid having
	 * CoW extents at the wrong offsets.
	 */
	if (xfs_inode_has_cow_data(ip)) {
		error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF,
				true);
		if (error)
			return error;
	}

	return 0;
}
/*
 * xfs_collapse_file_space()
 * This routine frees disk space and shifts extents for the given file.
 * The first thing we do is free the data blocks in the specified range
 * by calling xfs_free_file_space(). That also syncs dirty data and
 * invalidates the page cache over the region on which the collapse range
 * is working. Extent records are then shifted left to cover the hole.
 * RETURNS:
 * 0 on success
 * errno on error
 */
int
xfs_collapse_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;
	xfs_fileoff_t		next_fsb = XFS_B_TO_FSB(mp, offset + len);
	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
	bool			done = false;

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));

	trace_xfs_collapse_file_space(ip);

	error = xfs_free_file_space(ip, offset, len);
	if (error)
		return error;

	error = xfs_prepare_shift(ip, offset);
	if (error)
		return error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	while (!done) {
		error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
				&done);
		if (error)
			goto out_trans_cancel;
		if (done)
			break;

		/* finish any deferred frees and roll the transaction */
		error = xfs_defer_finish(&tp);
		if (error)
			goto out_trans_cancel;
	}

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
/*
 * xfs_insert_file_space()
 * This routine creates hole space by shifting extents for the given file.
 * The first thing we do is sync dirty data and invalidate the page cache
 * over the region on which the insert range is working. Then an extent is
 * split in two at the given offset by calling xfs_bmap_split_extent, and
 * all extent records lying between [offset, last allocated extent] are
 * shifted right to make room for the hole.
 * RETURNS:
 * 0 on success
 * errno on error
 */
int
xfs_insert_file_space(
	struct xfs_inode	*ip,
	loff_t			offset,
	loff_t			len)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;
	xfs_fileoff_t		stop_fsb = XFS_B_TO_FSB(mp, offset);
	xfs_fileoff_t		next_fsb = NULLFSBLOCK;
	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
	bool			done = false;

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));

	trace_xfs_insert_file_space(ip);

	error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb);
	if (error)
		return error;

	error = xfs_prepare_shift(ip, offset);
	if (error)
		return error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * The extent shifting code works on extent granularity. So, if stop_fsb
	 * is not the starting block of extent, we need to split the extent at
	 * stop_fsb.
	 */
	error = xfs_bmap_split_extent(tp, ip, stop_fsb);
	if (error)
		goto out_trans_cancel;
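
	/*
	 * Shift from the back of the file towards stop_fsb, rolling the
	 * transaction between passes so each pass runs against a fresh log
	 * reservation.
	 */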
	do {
		error = xfs_trans_roll_inode(&tp, ip);
		if (error)
			goto out_trans_cancel;

		error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
				&done, stop_fsb);
		if (error)
			goto out_trans_cancel;
	} while (!done);

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
/*
 * We need to check that the format of the data fork in the temporary inode is
 * valid for the target inode before doing the swap. This is not a problem with
 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
 * data fork depending on the space the attribute fork is taking so we can get
 * invalid formats on the target inode.
 *
 * E.g. target has space for 7 extents in extent format, temp inode only has
 * space for 6. If we defragment down to 7 extents, then the tmp format is a
 * btree, but when swapped it needs to be in extent format. Hence we can't just
 * blindly swap data forks on attr2 filesystems.
 *
 * Note that we check the swap in both directions so that we don't end up with
 * a corrupt temporary inode, either.
 *
 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
 * inode will prevent this situation from occurring, so all we do here is
 * reject and log the attempt. Basically we are putting the responsibility on
 * userspace to get this right.
 */
static int
xfs_swap_extents_check_format(
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip)	/* tmp inode */
{
	/* Should never get a local format */
	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
		return -EINVAL;

	/*
	 * If the target inode has fewer extents than the temporary inode,
	 * why did userspace call us?
	 */
	if (ip->i_d.di_nextents < tip->i_d.di_nextents)
		return -EINVAL;

	/*
	 * If we have to use the (expensive) rmap swap method, we can
	 * handle any number of extents and any format.
	 */
	if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb))
		return 0;

	/*
	 * If the target inode is in extent form and the temp inode is in btree
	 * form then we will end up with the target inode in the wrong format
	 * as we already know there are fewer extents in the temp inode.
	 */
	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
		return -EINVAL;

	/* Check temp in extent form to max in target */
	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
			XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
		return -EINVAL;

	/* Check target in extent form to max in temp */
	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
			XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
		return -EINVAL;

	/*
	 * If we are in a btree format, check that the temp root block will fit
	 * in the target and that it has enough extents to be in btree format
	 * in the target.
	 *
	 * Note that we have to be careful to allow btree->extent conversions
	 * (a common defrag case) which will occur when the temp inode is in
	 * extent format...
	 */
	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
		if (XFS_IFORK_Q(ip) &&
		    XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
			return -EINVAL;
		if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
		    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
			return -EINVAL;
	}

	/* Reciprocal target->temp btree format checks */
	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
		if (XFS_IFORK_Q(tip) &&
		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
			return -EINVAL;
		if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
		    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
			return -EINVAL;
	}

	return 0;
}
static int
xfs_swap_extent_flush(
	struct xfs_inode	*ip)
{
	int	error;

	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
	if (error)
		return error;
	truncate_pagecache_range(VFS_I(ip), 0, -1);

	/* Verify O_DIRECT for ftmp */
	if (VFS_I(ip)->i_mapping->nrpages)
		return -EINVAL;
	return 0;
}
/*
 * Move extents from one file to another, when rmap is enabled.
 */
STATIC int
xfs_swap_extent_rmap(
	struct xfs_trans		**tpp,
	struct xfs_inode		*ip,
	struct xfs_inode		*tip)
{
	struct xfs_trans		*tp = *tpp;
	struct xfs_bmbt_irec		irec;
	struct xfs_bmbt_irec		uirec;
	struct xfs_bmbt_irec		tirec;
	xfs_fileoff_t			offset_fsb;
	xfs_fileoff_t			end_fsb;
	xfs_filblks_t			count_fsb;
	int				error;
	xfs_filblks_t			ilen;
	xfs_filblks_t			rlen;
	int				nimaps;
	uint64_t			tip_flags2;

	/*
	 * If the source file has shared blocks, we must flag the donor
	 * file as having shared blocks so that we get the shared-block
	 * rmap functions when we go to fix up the rmaps. The flags
	 * will be switched for real later.
	 */
	tip_flags2 = tip->i_d.di_flags2;
	if (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)
		tip->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;

	offset_fsb = 0;
	end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);

	while (count_fsb) {
		/* Read extent from the donor file */
		nimaps = 1;
		error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
				&nimaps, 0);
		if (error)
			goto out;
		ASSERT(nimaps == 1);
		ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);

		trace_xfs_swap_extent_rmap_remap(tip, &tirec);
		ilen = tirec.br_blockcount;

		/* Unmap the old blocks in the source file. */
		while (tirec.br_blockcount) {
			ASSERT(tp->t_firstblock == NULLFSBLOCK);
			trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);

			/* Read extent from the source file */
			nimaps = 1;
			error = xfs_bmapi_read(ip, tirec.br_startoff,
					tirec.br_blockcount, &irec,
					&nimaps, 0);
			if (error)
				goto out;
			ASSERT(nimaps == 1);
			ASSERT(tirec.br_startoff == irec.br_startoff);
			trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);

			/* Trim the extent. */
			uirec = tirec;
			uirec.br_blockcount = rlen = min_t(xfs_filblks_t,
					tirec.br_blockcount,
					irec.br_blockcount);
			trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);

			/* Remove the mapping from the donor file. */
			xfs_bmap_unmap_extent(tp, tip, &uirec);

			/* Remove the mapping from the source file. */
			xfs_bmap_unmap_extent(tp, ip, &irec);

			/* Map the donor file's blocks into the source file. */
			xfs_bmap_map_extent(tp, ip, &uirec);

			/* Map the source file's blocks into the donor file. */
			xfs_bmap_map_extent(tp, tip, &irec);

			error = xfs_defer_finish(tpp);
			tp = *tpp;
			if (error)
				goto out;

			tirec.br_startoff += rlen;
			if (tirec.br_startblock != HOLESTARTBLOCK &&
			    tirec.br_startblock != DELAYSTARTBLOCK)
				tirec.br_startblock += rlen;
			tirec.br_blockcount -= rlen;
		}

		/* Roll on... */
		count_fsb -= ilen;
		offset_fsb += ilen;
	}

	tip->i_d.di_flags2 = tip_flags2;
	return 0;

out:
	trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
	tip->i_d.di_flags2 = tip_flags2;
	return error;
}
/* Swap the extents of two files by swapping data forks. */
STATIC int
xfs_swap_extent_forks(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	struct xfs_inode	*tip,
	int			*src_log_flags,
	int			*target_log_flags)
{
	xfs_filblks_t		aforkblks = 0;
	xfs_filblks_t		taforkblks = 0;
	xfs_extnum_t		junk;
	uint64_t		tmp;
	int			error;

	/*
	 * Count the number of extended attribute blocks
	 */
	if ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0) &&
	    (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
				&aforkblks);
		if (error)
			return error;
	}
	if ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0) &&
	    (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
				&taforkblks);
		if (error)
			return error;
	}

	/*
	 * Btree format (v3) inodes have the inode number stamped in the bmbt
	 * block headers. We can't start changing the bmbt blocks until the
	 * inode owner change is logged so recovery does the right thing in the
	 * event of a crash. Set the owner change log flags now and leave the
	 * bmbt scan as the last step.
	 */
	if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
		if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
			(*target_log_flags) |= XFS_ILOG_DOWNER;
		if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
			(*src_log_flags) |= XFS_ILOG_DOWNER;
	}

	/*
	 * Swap the data forks of the inodes
	 */
	swap(ip->i_df, tip->i_df);

	/*
	 * Fix the on-disk inode values
	 */
	tmp = (uint64_t)ip->i_d.di_nblocks;
	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;

	swap(ip->i_d.di_nextents, tip->i_d.di_nextents);
	swap(ip->i_d.di_format, tip->i_d.di_format);

	/*
	 * The extents in the source inode could still contain speculative
	 * preallocation beyond EOF (e.g. the file is open but not modified
	 * while defrag is in progress). In that case, we need to copy over the
	 * number of delalloc blocks the data fork in the source inode is
	 * tracking beyond EOF so that when the fork is truncated away when the
	 * temporary inode is unlinked we don't underrun the i_delayed_blks
	 * counter on that inode.
	 */
	ASSERT(tip->i_delayed_blks == 0);
	tip->i_delayed_blks = ip->i_delayed_blks;
	ip->i_delayed_blks = 0;

	switch (ip->i_d.di_format) {
	case XFS_DINODE_FMT_EXTENTS:
		(*src_log_flags) |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
		       (*src_log_flags & XFS_ILOG_DOWNER));
		(*src_log_flags) |= XFS_ILOG_DBROOT;
		break;
	}

	switch (tip->i_d.di_format) {
	case XFS_DINODE_FMT_EXTENTS:
		(*target_log_flags) |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		(*target_log_flags) |= XFS_ILOG_DBROOT;
		ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
		       (*target_log_flags & XFS_ILOG_DOWNER));
		break;
	}

	return 0;
}
/*
 * Fix up the owners of the bmbt blocks to refer to the current inode. The
 * change owner scan attempts to order all modified buffers in the current
 * transaction. In the event of ordered buffer failure, the offending buffer is
 * physically logged as a fallback and the scan returns -EAGAIN. We must roll
 * the transaction in this case to replenish the fallback log reservation and
 * restart the scan. This process repeats until the scan completes.
 */
static int
xfs_swap_change_owner(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	struct xfs_inode	*tmpip)
{
	int			error;
	struct xfs_trans	*tp = *tpp;

	do {
		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
					      NULL);
		/* success or fatal error */
		if (error != -EAGAIN)
			break;

		error = xfs_trans_roll(tpp);
		if (error)
			break;
		tp = *tpp;

		/*
		 * Redirty both inodes so they can relog and keep the log tail
		 * moving forward.
		 */
		xfs_trans_ijoin(tp, ip, 0);
		xfs_trans_ijoin(tp, tmpip, 0);
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
	} while (true);

	return error;
}
int
xfs_swap_extents(
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip,	/* tmp inode */
	struct xfs_swapext	*sxp)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	struct xfs_bstat	*sbp = &sxp->sx_stat;
	int			src_log_flags, target_log_flags;
	int			error = 0;
	int			lock_flags;
	uint64_t		f;
	int			resblks = 0;

	/*
	 * Lock the inodes against other IO, page faults and truncate to
	 * begin with. Then we can ensure the inodes are flushed and have no
	 * page cache safely. Once we have done this we can take the ilocks and
	 * do the rest of the checks.
	 */
	lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
	lock_flags = XFS_MMAPLOCK_EXCL;
	xfs_lock_two_inodes(ip, XFS_MMAPLOCK_EXCL, tip, XFS_MMAPLOCK_EXCL);

	/* Verify that both files have the same format */
	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
		error = -EINVAL;
		goto out_unlock;
	}

	/* Verify both files are either real-time or non-realtime */
	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
		error = -EINVAL;
		goto out_unlock;
	}

	error = xfs_qm_dqattach(ip);
	if (error)
		goto out_unlock;

	error = xfs_qm_dqattach(tip);
	if (error)
		goto out_unlock;

	error = xfs_swap_extent_flush(ip);
	if (error)
		goto out_unlock;
	error = xfs_swap_extent_flush(tip);
	if (error)
		goto out_unlock;

	if (xfs_inode_has_cow_data(tip)) {
		error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
		if (error)
			return error;
	}

	/*
	 * Extent "swapping" with rmap requires a permanent reservation and
	 * a block reservation because it's really just a remap operation
	 * performed with log redo items!
	 */
	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
		int		w = XFS_DATA_FORK;
		uint32_t	ipnext = XFS_IFORK_NEXTENTS(ip, w);
		uint32_t	tipnext	= XFS_IFORK_NEXTENTS(tip, w);

		/*
		 * Conceptually this shouldn't affect the shape of either bmbt,
		 * but since we atomically move extents one by one, we reserve
		 * enough space to rebuild both trees.
		 */
		resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w);
		resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);

		/*
		 * Handle the corner case where either inode might straddle the
		 * btree format boundary. If so, the inode could bounce between
		 * btree <-> extent format on unmap -> remap cycles, freeing and
		 * allocating a bmapbt block each time.
		 */
		if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1))
			resblks += XFS_IFORK_MAXEXT(ip, w);
		if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1))
			resblks += XFS_IFORK_MAXEXT(tip, w);
	}
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
	if (error)
		goto out_unlock;

	/*
	 * Lock and join the inodes to the transaction so that transaction
	 * commit or cancel will unlock the inodes from this point onwards.
	 */
	xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
	lock_flags |= XFS_ILOCK_EXCL;
	xfs_trans_ijoin(tp, ip, 0);
	xfs_trans_ijoin(tp, tip, 0);

	/* Verify all data are being swapped */
	if (sxp->sx_offset != 0 ||
	    sxp->sx_length != ip->i_d.di_size ||
	    sxp->sx_length != tip->i_d.di_size) {
		error = -EFAULT;
		goto out_trans_cancel;
	}

	trace_xfs_swap_extent_before(ip, 0);
	trace_xfs_swap_extent_before(tip, 1);

	/* check inode formats now that data is flushed */
	error = xfs_swap_extents_check_format(ip, tip);
	if (error) {
		xfs_notice(mp,
		    "%s: inode 0x%llx format is incompatible for exchanging.",
				__func__, ip->i_ino);
		goto out_trans_cancel;
	}

	/*
	 * Compare the current change & modify times with that
	 * passed in. If they differ, we abort this swap.
	 * This is the mechanism used to ensure the calling
	 * process that the file was not changed out from
	 * under it.
	 */
	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
		error = -EBUSY;
		goto out_trans_cancel;
	}

	/*
	 * Note the trickiness in setting the log flags - we set the owner log
	 * flag on the opposite inode (i.e. the inode we are setting the new
	 * owner to be) because once we swap the forks and log that, log
	 * recovery is going to see the fork as owned by the swapped inode,
	 * not the pre-swapped inodes.
	 */
	src_log_flags = XFS_ILOG_CORE;
	target_log_flags = XFS_ILOG_CORE;

	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
		error = xfs_swap_extent_rmap(&tp, ip, tip);
	else
		error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
				&target_log_flags);
	if (error)
		goto out_trans_cancel;

	/* Do we have to swap reflink flags? */
	if ((ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) ^
	    (tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)) {
		f = ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
		ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
		tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
		tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK;
	}

	/* Swap the cow forks. */
	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
		ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS);
		ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS);

		swap(ip->i_cnextents, tip->i_cnextents);
		swap(ip->i_cowfp, tip->i_cowfp);

		if (ip->i_cowfp && ip->i_cowfp->if_bytes)
			xfs_inode_set_cowblocks_tag(ip);
		else
			xfs_inode_clear_cowblocks_tag(ip);
		if (tip->i_cowfp && tip->i_cowfp->if_bytes)
			xfs_inode_set_cowblocks_tag(tip);
		else
			xfs_inode_clear_cowblocks_tag(tip);
	}

	xfs_trans_log_inode(tp, ip, src_log_flags);
	xfs_trans_log_inode(tp, tip, target_log_flags);

	/*
	 * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
	 * have inode number owner values in the bmbt blocks that still refer to
	 * the old inode. Scan each bmbt to fix up the owner values with the
	 * inode number of the current inode.
	 */
	if (src_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(&tp, ip, tip);
		if (error)
			goto out_trans_cancel;
	}
	if (target_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(&tp, tip, ip);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 */
	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);

	trace_xfs_swap_extent_after(ip, 0);
	trace_xfs_swap_extent_after(tip, 1);

out_unlock:
	xfs_iunlock(ip, lock_flags);
	xfs_iunlock(tip, lock_flags);
	unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}