/kern_oII/fs/ext4/inode.c


/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 * from
 *
 *  linux/fs/minix/inode.c
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 *
 * Goal-directed block allocation by Stephen Tweedie
 *	(sct@redhat.com), 1993, 1998
 * Big-endian to little-endian byte-swapping/bitmaps by
 *	David S. Miller (davem@caip.rutgers.edu), 1995
 * 64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "ext4_extents.h"

#include <trace/events/ext4.h>

#define MPAGE_DA_EXTENT_TAIL 0x01
static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
{
        return jbd2_journal_begin_ordered_truncate(
                                        EXT4_SB(inode->i_sb)->s_journal,
                                        &EXT4_I(inode)->jinode,
                                        new_size);
}

static void ext4_invalidatepage(struct page *page, unsigned long offset);
/*
 * Test whether an inode is a fast symlink.
 */
static int ext4_inode_is_fast_symlink(struct inode *inode)
{
        int ea_blocks = EXT4_I(inode)->i_file_acl ?
                (inode->i_sb->s_blocksize >> 9) : 0;

        return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}
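/*
 * Note on the test above: i_blocks is counted in 512-byte sectors, which
 * is why the external-attribute block is converted with s_blocksize >> 9.
 * A fast symlink keeps its target inline in i_data and so owns no data
 * sectors beyond (possibly) the xattr block.
 */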
/*
 * The ext4 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (eg. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 *
 * If the handle isn't valid we're not journaling, but we still need to
 * call into ext4_journal_revoke() to put the buffer head.
 */
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
                struct buffer_head *bh, ext4_fsblk_t blocknr)
{
        int err;

        might_sleep();

        BUFFER_TRACE(bh, "enter");

        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
                  "data mode %x\n",
                  bh, is_metadata, inode->i_mode,
                  test_opt(inode->i_sb, DATA_FLAGS));

        /* Never use the revoke function if we are doing full data
         * journaling: there is no need to, and a V1 superblock won't
         * support it.  Otherwise, only skip the revoke on un-journaled
         * data blocks. */

        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
            (!is_metadata && !ext4_should_journal_data(inode))) {
                if (bh) {
                        BUFFER_TRACE(bh, "call jbd2_journal_forget");
                        return ext4_journal_forget(handle, bh);
                }
                return 0;
        }

        /*
         * data!=journal && (is_metadata || should_journal_data(inode))
         */
        BUFFER_TRACE(bh, "call ext4_journal_revoke");
        err = ext4_journal_revoke(handle, blocknr, bh);
        if (err)
                ext4_abort(inode->i_sb, __func__,
                           "error %d when attempting revoke", err);
        BUFFER_TRACE(bh, "exit");
        return err;
}
/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */
static unsigned long blocks_for_truncate(struct inode *inode)
{
        ext4_lblk_t needed;

        needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

        /* Give ourselves just enough room to cope with inodes in which
         * i_blocks is corrupt: we've seen disk corruptions in the past
         * which resulted in random data in an inode which looked enough
         * like a regular file for ext4 to try to delete it.  Things
         * will go a bit crazy if that happens, but at least we should
         * try not to panic the whole kernel. */
        if (needed < 2)
                needed = 2;

        /* But we need to bound the transaction so we don't overflow the
         * journal. */
        if (needed > EXT4_MAX_TRANS_DATA)
                needed = EXT4_MAX_TRANS_DATA;

        return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}
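/*
 * Worked example (assuming a 4KiB block size, so s_blocksize_bits == 12):
 * an inode with i_blocks == 800 512-byte sectors covers 800 >> 3 == 100
 * filesystem blocks, so the next truncate chunk asks for
 * EXT4_DATA_TRANS_BLOCKS(sb) + 100 credits; the second term is clamped
 * to at least 2 and at most EXT4_MAX_TRANS_DATA.
 */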
/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */
static handle_t *start_transaction(struct inode *inode)
{
        handle_t *result;

        result = ext4_journal_start(inode, blocks_for_truncate(inode));
        if (!IS_ERR(result))
                return result;

        ext4_std_error(inode->i_sb, PTR_ERR(result));
        return result;
}
/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted, we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
        if (!ext4_handle_valid(handle))
                return 0;
        if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
                return 0;
        if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
                return 0;
        return 1;
}
/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
{
        BUG_ON(EXT4_JOURNAL(inode) == NULL);
        jbd_debug(2, "restarting handle %p\n", handle);
        return ext4_journal_restart(handle, blocks_for_truncate(inode));
}
/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext4_delete_inode(struct inode *inode)
{
        handle_t *handle;
        int err;

        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages(&inode->i_data, 0);

        if (is_bad_inode(inode))
                goto no_delete;

        handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
                 * If we're going to skip the normal cleanup, we still need to
                 * make sure that the in-core orphan linked list is properly
                 * cleaned up.
                 */
                ext4_orphan_del(NULL, inode);
                goto no_delete;
        }

        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
        inode->i_size = 0;
        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
                ext4_warning(inode->i_sb, __func__,
                             "couldn't mark inode dirty (err %d)", err);
                goto stop_handle;
        }
        if (inode->i_blocks)
                ext4_truncate(inode);

        /*
         * ext4_ext_truncate() doesn't reserve any slop when it
         * restarts journal transactions; therefore there may not be
         * enough credits left in the handle to remove the inode from
         * the orphan list and set the dtime field.
         */
        if (!ext4_handle_has_enough_credits(handle, 3)) {
                err = ext4_journal_extend(handle, 3);
                if (err > 0)
                        err = ext4_journal_restart(handle, 3);
                if (err != 0) {
                        ext4_warning(inode->i_sb, __func__,
                                     "couldn't extend journal (err %d)", err);
                stop_handle:
                        ext4_journal_stop(handle);
                        goto no_delete;
                }
        }

        /*
         * Kill off the orphan record which ext4_truncate created.
         * AKPM: I think this can be inside the above `if'.
         * Note that ext4_orphan_del() has to be able to cope with the
         * deletion of a non-existent orphan - this is because we don't
         * know if ext4_truncate() actually created an orphan record.
         * (Well, we could do this if we need to, but heck - it works)
         */
        ext4_orphan_del(handle, inode);
        EXT4_I(inode)->i_dtime = get_seconds();

        /*
         * One subtle ordering requirement: if anything has gone wrong
         * (transaction abort, IO errors, whatever), then we can still
         * do these next steps (the fs will already have been marked as
         * having errors), but we can't free the inode if the mark_dirty
         * fails.
         */
        if (ext4_mark_inode_dirty(handle, inode))
                /* If that failed, just do the required in-core inode clear. */
                clear_inode(inode);
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
        return;
no_delete:
        clear_inode(inode);     /* We must guarantee clearing of inode... */
}
typedef struct {
        __le32  *p;
        __le32  key;
        struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
        p->key = *(p->p = v);
        p->bh = bh;
}
/**
 *	ext4_block_to_path - parse the block number into array of offsets
 *	@inode: inode in question (we are only interested in its superblock)
 *	@i_block: block number to be parsed
 *	@offsets: array to store the offsets in
 *	@boundary: set this non-zero if the referred-to block is likely to be
 *		followed (on disk) by an indirect block.
 *
 *	To store the locations of a file's data ext4 uses a data structure
 *	common for UNIX filesystems - a tree of pointers anchored in the
 *	inode, with data blocks at leaves and indirect blocks in intermediate
 *	nodes.  This function translates the block number into a path in that
 *	tree - the return value is the path length and @offsets[n] is the
 *	offset of the pointer to the (n+1)th node in the nth one.  If @block
 *	is out of range (negative or too large) a warning is printed and zero
 *	returned.
 *
 *	Note: function doesn't find node addresses, so no IO is needed.  All
 *	we need to know is the capacity of indirect blocks (taken from the
 *	inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks.  We might use long long, but that would
 * kill us on x86.  Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */
static int ext4_block_to_path(struct inode *inode,
                              ext4_lblk_t i_block,
                              ext4_lblk_t offsets[4], int *boundary)
{
        int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
        const long direct_blocks = EXT4_NDIR_BLOCKS,
                indirect_blocks = ptrs,
                double_blocks = (1 << (ptrs_bits * 2));
        int n = 0;
        int final = 0;

        if (i_block < 0) {
                ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
        } else if (i_block < direct_blocks) {
                offsets[n++] = i_block;
                final = direct_blocks;
        } else if ((i_block -= direct_blocks) < indirect_blocks) {
                offsets[n++] = EXT4_IND_BLOCK;
                offsets[n++] = i_block;
                final = ptrs;
        } else if ((i_block -= indirect_blocks) < double_blocks) {
                offsets[n++] = EXT4_DIND_BLOCK;
                offsets[n++] = i_block >> ptrs_bits;
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
                offsets[n++] = EXT4_TIND_BLOCK;
                offsets[n++] = i_block >> (ptrs_bits * 2);
                offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else {
                ext4_warning(inode->i_sb, "ext4_block_to_path",
                             "block %lu > max in inode %lu",
                             i_block + direct_blocks +
                             indirect_blocks + double_blocks, inode->i_ino);
        }
        if (boundary)
                *boundary = final - 1 - (i_block & (ptrs - 1));
        return n;
}
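/*
 * Worked example (assuming a 4KiB block size, so ptrs == 1024 and
 * EXT4_NDIR_BLOCKS == 12):
 *
 *	i_block == 5    -> depth 1, offsets = { 5 }
 *	i_block == 200  -> depth 2, offsets = { EXT4_IND_BLOCK, 188 }
 *	i_block == 2000 -> depth 3, offsets = { EXT4_DIND_BLOCK, 0, 964 }
 *
 * (200 - 12 == 188 fits in the single indirect block; 2000 - 12 - 1024
 * == 964 indexes the first indirect block hanging off the double
 * indirect block.)
 */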
static int __ext4_check_blockref(const char *function, struct inode *inode,
                                 __le32 *p, unsigned int max)
{
        __le32 *bref = p;
        unsigned int blk;

        while (bref < p+max) {
                blk = le32_to_cpu(*bref++);
                if (blk &&
                    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
                                                    blk, 1))) {
                        ext4_error(inode->i_sb, function,
                                   "invalid block reference %u "
                                   "in inode #%lu", blk, inode->i_ino);
                        return -EIO;
                }
        }
        return 0;
}

#define ext4_check_indirect_blockref(inode, bh)                         \
        __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
                              EXT4_ADDR_PER_BLOCK((inode)->i_sb))

#define ext4_check_inode_blockref(inode)                                \
        __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
                              EXT4_NDIR_BLOCKS)
/**
 *	ext4_get_branch - read the chain of indirect blocks leading to data
 *	@inode: inode in question
 *	@depth: depth of the chain (1 - direct pointer, etc.)
 *	@offsets: offsets of pointers in inode/indirect blocks
 *	@chain: place to store the result
 *	@err: here we store the error value
 *
 *	Function fills the array of triples <key, p, bh> and returns %NULL
 *	if everything went OK or the pointer to the last filled triple
 *	(incomplete one) otherwise.  Upon the return chain[i].key contains
 *	the number of (i+1)-th block in the chain (as it is stored in memory,
 *	i.e. little-endian 32-bit), chain[i].p contains the address of that
 *	number (it points into struct inode for i==0 and into the bh->b_data
 *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 *	block for i>0 and NULL for i==0.  In other words, it holds the block
 *	numbers of the chain, addresses they were taken from (and where we can
 *	verify that chain did not change) and buffer_heads hosting these
 *	numbers.
 *
 *	Function stops when it stumbles upon zero pointer (absent block)
 *		(pointer to last triple returned, *@err == 0)
 *	or when it gets an IO error reading an indirect block
 *		(ditto, *@err == -EIO)
 *	or when it reads all @depth-1 indirect blocks successfully and finds
 *	the whole chain, all the way to the data (returns %NULL, *err == 0).
 *
 *	Needs to be called with
 *	down_read(&EXT4_I(inode)->i_data_sem)
 */
static Indirect *ext4_get_branch(struct inode *inode, int depth,
                                 ext4_lblk_t *offsets,
                                 Indirect chain[4], int *err)
{
        struct super_block *sb = inode->i_sb;
        Indirect *p = chain;
        struct buffer_head *bh;

        *err = 0;
        /* i_data is not going away, no lock needed */
        add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
        if (!p->key)
                goto no_block;
        while (--depth) {
                bh = sb_getblk(sb, le32_to_cpu(p->key));
                if (unlikely(!bh))
                        goto failure;

                if (!bh_uptodate_or_lock(bh)) {
                        if (bh_submit_read(bh) < 0) {
                                put_bh(bh);
                                goto failure;
                        }
                        /* validate block references */
                        if (ext4_check_indirect_blockref(inode, bh)) {
                                put_bh(bh);
                                goto failure;
                        }
                }

                add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
                /* Reader: end */
                if (!p->key)
                        goto no_block;
        }
        return NULL;

failure:
        *err = -EIO;
no_block:
        return p;
}
/**
 *	ext4_find_near - find a place for allocation with sufficient locality
 *	@inode: owner
 *	@ind: descriptor of indirect block.
 *
 *	This function returns the preferred place for block allocation.
 *	It is used when the heuristic for sequential allocation fails.
 *	Rules are:
 *	  + if there is a block to the left of our position - allocate near it.
 *	  + if pointer will live in indirect block - allocate near that block.
 *	  + if pointer will live in inode - allocate in the same
 *	    cylinder group.
 *
 *	In the latter case we colour the starting block by the caller's PID to
 *	prevent it from clashing with concurrent allocations for a different
 *	inode in the same block group.  The PID is used here so that
 *	functionally related files will be close-by on-disk.
 *
 *	Caller must make sure that @ind is valid and will stay that way.
 */
static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
        __le32 *p;
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;
        ext4_grpblk_t colour;
        ext4_group_t block_group;
        int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));

        /* Try to find previous block */
        for (p = ind->p - 1; p >= start; p--) {
                if (*p)
                        return le32_to_cpu(*p);
        }

        /* No such thing, so let's try location of indirect block */
        if (ind->bh)
                return ind->bh->b_blocknr;

        /*
         * It is going to be referred to from the inode itself? OK, just put
         * it into the same cylinder group then.
         */
        block_group = ei->i_block_group;
        if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
                block_group &= ~(flex_size-1);
                if (S_ISREG(inode->i_mode))
                        block_group++;
        }
        bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

        /*
         * If we are doing delayed allocation, we don't need to take
         * colour into account.
         */
        if (test_opt(inode->i_sb, DELALLOC))
                return bg_start;

        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (current->pid % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
        else
                colour = (current->pid % 16) * ((last_block - bg_start) / 16);
        return bg_start + colour;
}
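/*
 * The colour term above simply spreads concurrent allocators across the
 * group.  For example, with 32768 blocks per group (a 4KiB block size),
 * a process with pid % 16 == 5 starts its search 5 * (32768 / 16) ==
 * 10240 blocks into the group, so unrelated writers rarely contend for
 * the same region of the group.
 */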
/**
 *	ext4_find_goal - find a preferred place for allocation.
 *	@inode: owner
 *	@block: block we want
 *	@partial: pointer to the last triple within a chain
 *
 *	Normally this function finds the preferred place for block allocation
 *	and returns it.
 */
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
                                   Indirect *partial)
{
        /*
         * XXX need to get goal block from mballoc's data structures
         */

        return ext4_find_near(inode, partial);
}
/**
 *	ext4_blks_to_allocate: look up the block map and count the number
 *	of direct blocks that need to be allocated for the given branch.
 *
 *	@branch: chain of indirect blocks
 *	@k: number of blocks needed for indirect blocks
 *	@blks: number of data blocks to be mapped.
 *	@blocks_to_boundary: the offset in the indirect block
 *
 *	return the total number of blocks to be allocated, including the
 *	direct and indirect blocks.
 */
static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
                                 int blocks_to_boundary)
{
        unsigned int count = 0;

        /*
         * Simple case: the [t,d]indirect block(s) have not been allocated
         * yet, so it is clear that the blocks on that path have not been
         * allocated either.
         */
        if (k > 0) {
                /* right now we don't handle cross boundary allocation */
                if (blks < blocks_to_boundary + 1)
                        count += blks;
                else
                        count += blocks_to_boundary + 1;
                return count;
        }

        count++;
        while (count < blks && count <= blocks_to_boundary &&
                le32_to_cpu(*(branch[0].p + count)) == 0) {
                count++;
        }
        return count;
}
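/*
 * Worked example: with blks == 8 direct blocks wanted and
 * blocks_to_boundary == 3 (three more slots before the indirect block
 * ends), a branch that still needs an indirect block (k > 0) is counted
 * as min(blks, blocks_to_boundary + 1) == 4 blocks; a branch whose
 * indirect block already exists (k == 0) instead scans forward from
 * branch[0].p and stops at the first slot that is already in use.
 */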
/**
 *	ext4_alloc_blocks: allocate the blocks needed for a branch
 *	@indirect_blks: the number of blocks that need to be allocated for
 *	indirect blocks
 *
 *	@new_blocks: on return it will store the new block numbers for
 *	the indirect blocks (if needed) and the first direct block
 *	@blks: on return it will store the total number of allocated
 *	direct blocks
 */
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                             ext4_lblk_t iblock, ext4_fsblk_t goal,
                             int indirect_blks, int blks,
                             ext4_fsblk_t new_blocks[4], int *err)
{
        struct ext4_allocation_request ar;
        int target, i;
        unsigned long count = 0, blk_allocated = 0;
        int index = 0;
        ext4_fsblk_t current_block = 0;
        int ret = 0;

        /*
         * Here we try to allocate the requested multiple blocks at once,
         * on a best-effort basis.
         * To build a branch, we should allocate blocks for
         * the indirect blocks (if not allocated yet), and at least
         * the first direct block of this branch.  That's the
         * minimum number of blocks we need to allocate (required).
         */
        /* first we try to allocate the indirect blocks */
        target = indirect_blks;
        while (target > 0) {
                count = target;
                /* allocating blocks for indirect blocks and direct blocks */
                current_block = ext4_new_meta_blocks(handle, inode,
                                                     goal, &count, err);
                if (*err)
                        goto failed_out;

                target -= count;
                /* allocate blocks for indirect blocks */
                while (index < indirect_blks && count) {
                        new_blocks[index++] = current_block++;
                        count--;
                }
                if (count > 0) {
                        /*
                         * save the new block number
                         * for the first direct block
                         */
                        new_blocks[index] = current_block;
                        printk(KERN_INFO "%s returned more blocks than "
                                         "requested\n", __func__);
                        WARN_ON(1);
                        break;
                }
        }

        target = blks - count;
        blk_allocated = count;
        if (!target)
                goto allocated;
        /* Now allocate data blocks */
        memset(&ar, 0, sizeof(ar));
        ar.inode = inode;
        ar.goal = goal;
        ar.len = target;
        ar.logical = iblock;
        if (S_ISREG(inode->i_mode))
                /* enable in-core preallocation only for regular files */
                ar.flags = EXT4_MB_HINT_DATA;

        current_block = ext4_mb_new_blocks(handle, &ar, err);

        if (*err && (target == blks)) {
                /*
                 * if the allocation failed and we didn't allocate
                 * any blocks before
                 */
                goto failed_out;
        }
        if (!*err) {
                if (target == blks) {
                        /*
                         * save the new block number
                         * for the first direct block
                         */
                        new_blocks[index] = current_block;
                }
                blk_allocated += ar.len;
        }
allocated:
        /* total number of blocks allocated for direct blocks */
        ret = blk_allocated;
        *err = 0;
        return ret;
failed_out:
        for (i = 0; i < index; i++)
                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
        return ret;
}
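/*
 * A sketch of the flow above for indirect_blks == 2 and blks == 4
 * (assumed figures): the first loop keeps calling ext4_new_meta_blocks()
 * until the two metadata blocks sit in new_blocks[0..1]; any surplus from
 * that request supplies the first direct block.  Whatever is still
 * missing is then requested from ext4_mb_new_blocks() as one extent, and
 * the function returns the number of direct blocks actually allocated,
 * which may be fewer than the 4 that were asked for.
 */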
/**
 *	ext4_alloc_branch - allocate and set up a chain of blocks.
 *	@inode: owner
 *	@indirect_blks: number of allocated indirect blocks
 *	@blks: number of allocated direct blocks
 *	@offsets: offsets (in the blocks) to store the pointers to next.
 *	@branch: place to store the chain in.
 *
 *	This function allocates blocks, zeroes out all but the last one,
 *	links them into chain and (if we are synchronous) writes them to disk.
 *	In other words, it prepares a branch that can be spliced onto the
 *	inode.  It stores the information about that chain in the branch[], in
 *	the same format as ext4_get_branch() would do.  We are calling it after
 *	we had read the existing part of chain and partial points to the last
 *	triple of that (one with zero ->key).  Upon the exit we have the same
 *	picture as after the successful ext4_get_block(), except that in one
 *	place chain is disconnected - *branch->p is still zero (we did not
 *	set the last link), but branch->key contains the number that should
 *	be placed into *branch->p to fill that gap.
 *
 *	If allocation fails we free all blocks we've allocated (and forget
 *	their buffer_heads) and return the error value from the failed
 *	ext4_alloc_block() (normally -ENOSPC).  Otherwise we set the chain
 *	as described above and return 0.
 */
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                             ext4_lblk_t iblock, int indirect_blks,
                             int *blks, ext4_fsblk_t goal,
                             ext4_lblk_t *offsets, Indirect *branch)
{
        int blocksize = inode->i_sb->s_blocksize;
        int i, n = 0;
        int err = 0;
        struct buffer_head *bh;
        int num;
        ext4_fsblk_t new_blocks[4];
        ext4_fsblk_t current_block;

        num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
                                *blks, new_blocks, &err);
        if (err)
                return err;

        branch[0].key = cpu_to_le32(new_blocks[0]);
        /*
         * metadata blocks and data blocks are allocated.
         */
        for (n = 1; n <= indirect_blks;  n++) {
                /*
                 * Get buffer_head for parent block, zero it out
                 * and set the pointer to new one, then send
                 * parent to disk.
                 */
                bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
                branch[n].bh = bh;
                lock_buffer(bh);
                BUFFER_TRACE(bh, "call get_create_access");
                err = ext4_journal_get_create_access(handle, bh);
                if (err) {
                        unlock_buffer(bh);
                        brelse(bh);
                        goto failed;
                }

                memset(bh->b_data, 0, blocksize);
                branch[n].p = (__le32 *) bh->b_data + offsets[n];
                branch[n].key = cpu_to_le32(new_blocks[n]);
                *branch[n].p = branch[n].key;
                if (n == indirect_blks) {
                        current_block = new_blocks[n];
                        /*
                         * End of chain, update the last new metablock of
                         * the chain to point to the newly allocated
                         * data block numbers
                         */
                        for (i = 1; i < num; i++)
                                *(branch[n].p + i) = cpu_to_le32(++current_block);
                }
                BUFFER_TRACE(bh, "marking uptodate");
                set_buffer_uptodate(bh);
                unlock_buffer(bh);

                BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
                err = ext4_handle_dirty_metadata(handle, inode, bh);
                if (err)
                        goto failed;
        }
        *blks = num;
        return err;
failed:
        /* Allocation failed, free what we already allocated */
        for (i = 1; i <= n ; i++) {
                BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
                ext4_journal_forget(handle, branch[i].bh);
        }
        for (i = 0; i < indirect_blks; i++)
                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);

        ext4_free_blocks(handle, inode, new_blocks[i], num, 0);

        return err;
}
/**
 * ext4_splice_branch - splice the allocated branch onto inode.
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @chain: chain of indirect blocks (with a missing link - see
 *	ext4_alloc_branch)
 * @where: location of missing link
 * @num:   number of indirect blocks we are adding
 * @blks:  number of direct blocks we are adding
 *
 * This function fills the missing link and does all housekeeping needed in
 * inode (->i_blocks, etc.).  In case of success we end up with the full
 * chain to new block and return 0.
 */
static int ext4_splice_branch(handle_t *handle, struct inode *inode,
                              ext4_lblk_t block, Indirect *where, int num,
                              int blks)
{
        int i;
        int err = 0;
        ext4_fsblk_t current_block;

        /*
         * If we're splicing into a [td]indirect block (as opposed to the
         * inode) then we need to get write access to the [td]indirect block
         * before the splice.
         */
        if (where->bh) {
                BUFFER_TRACE(where->bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, where->bh);
                if (err)
                        goto err_out;
        }
        /* That's it */

        *where->p = where->key;

        /*
         * Update the host buffer_head or inode to point to the
         * just-allocated direct blocks.
         */
        if (num == 0 && blks > 1) {
                current_block = le32_to_cpu(where->key) + 1;
                for (i = 1; i < blks; i++)
                        *(where->p + i) = cpu_to_le32(current_block++);
        }

        /* We are done with atomic stuff, now do the rest of housekeeping */
        /* had we spliced it onto indirect block? */
        if (where->bh) {
                /*
                 * If we spliced it onto an indirect block, we haven't
                 * altered the inode.  Note however that if it is being spliced
                 * onto an indirect block at the very end of the file (the
                 * file is growing) then we *will* alter the inode to reflect
                 * the new i_size.  But that is not done here - it is done in
                 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
                 */
                jbd_debug(5, "splicing indirect only\n");
                BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
                err = ext4_handle_dirty_metadata(handle, inode, where->bh);
                if (err)
                        goto err_out;
        } else {
                /*
                 * OK, we spliced it into the inode itself on a direct block.
                 */
                ext4_mark_inode_dirty(handle, inode);
                jbd_debug(5, "splicing direct\n");
        }
        return err;

err_out:
        for (i = 1; i <= num; i++) {
                BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
                ext4_journal_forget(handle, where[i].bh);
                ext4_free_blocks(handle, inode,
                                 le32_to_cpu(where[i-1].key), 1, 0);
        }
        ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);

        return err;
}
/*
 * The ext4_ind_get_blocks() function handles non-extents inodes
 * (i.e., using the traditional indirect/double-indirect i_blocks
 * scheme) for ext4_get_blocks().
 *
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf.  So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * `handle' can be NULL if create == 0.
 *
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
 *
 * The ext4_ind_get_blocks() function should be called with
 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
 * blocks.
 */
static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
                               ext4_lblk_t iblock, unsigned int maxblocks,
                               struct buffer_head *bh_result,
                               int flags)
{
        int err = -EIO;
        ext4_lblk_t offsets[4];
        Indirect chain[4];
        Indirect *partial;
        ext4_fsblk_t goal;
        int indirect_blks;
        int blocks_to_boundary = 0;
        int depth;
        int count = 0;
        ext4_fsblk_t first_block = 0;

        J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
        J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
        depth = ext4_block_to_path(inode, iblock, offsets,
                                   &blocks_to_boundary);

        if (depth == 0)
                goto out;

        partial = ext4_get_branch(inode, depth, offsets, chain, &err);

        /* Simplest case - block found, no allocation needed */
        if (!partial) {
                first_block = le32_to_cpu(chain[depth - 1].key);
                clear_buffer_new(bh_result);
                count++;
                /* map more blocks */
                while (count < maxblocks && count <= blocks_to_boundary) {
                        ext4_fsblk_t blk;

                        blk = le32_to_cpu(*(chain[depth-1].p + count));

                        if (blk == first_block + count)
                                count++;
                        else
                                break;
                }
                goto got_it;
        }

        /* Next simple case - plain lookup or failed read of indirect block */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
                goto cleanup;

        /*
         * Okay, we need to do block allocation.
         */
        goal = ext4_find_goal(inode, iblock, partial);

        /* the number of blocks we need to allocate for [d,t]indirect blocks */
        indirect_blks = (chain + depth) - partial - 1;

        /*
         * Next look up the indirect map to count the total number of
         * direct blocks to allocate for this branch.
         */
        count = ext4_blks_to_allocate(partial, indirect_blks,
                                      maxblocks, blocks_to_boundary);
        /*
         * Block out ext4_truncate while we alter the tree
         */
        err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
                                &count, goal,
                                offsets + (partial - chain), partial);

        /*
         * The ext4_splice_branch call will free and forget any buffers
         * on the new chain if there is a failure, but that risks using
         * up transaction credits, especially for bitmaps where the
         * credits cannot be returned.  Can we handle this somehow?  We
         * may need to return -EAGAIN upwards in the worst case. --sct
         */
        if (!err)
                err = ext4_splice_branch(handle, inode, iblock,
                                         partial, indirect_blks, count);
        else
                goto cleanup;

        set_buffer_new(bh_result);
got_it:
        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
        if (count > blocks_to_boundary)
                set_buffer_boundary(bh_result);
        err = count;
        /* Clean up and exit */
        partial = chain + depth - 1;    /* the whole chain */
cleanup:
        while (partial > chain) {
                BUFFER_TRACE(partial->bh, "call brelse");
                brelse(partial->bh);
                partial--;
        }
        BUFFER_TRACE(bh_result, "returned");
out:
        return err;
}
qsize_t ext4_get_reserved_space(struct inode *inode)
{
        unsigned long long total;

        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
        total = EXT4_I(inode)->i_reserved_data_blocks +
                EXT4_I(inode)->i_reserved_meta_blocks;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

        return total;
}
/*
 * Calculate the number of metadata blocks that need to be reserved
 * to allocate @blocks for a non-extent-based file.
 */
static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
{
        int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        int ind_blks, dind_blks, tind_blks;

        /* number of new indirect blocks needed */
        ind_blks = (blocks + icap - 1) / icap;

        dind_blks = (ind_blks + icap - 1) / icap;

        tind_blks = 1;

        return ind_blks + dind_blks + tind_blks;
}
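/*
 * Worked example (assuming a 4KiB block size, so icap == 1024): a
 * delayed-allocation write of 1000 blocks reserves
 * (1000 + 1023) / 1024 == 1 indirect block, 1 double-indirect block and
 * 1 triple-indirect block, i.e. 3 metadata blocks in the worst case.
 * Blocks that turn out to share already-allocated indirect blocks give
 * the surplus back in ext4_da_update_reserve_space() below.
 */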
/*
 * Calculate the number of metadata blocks that need to be reserved
 * to allocate the given number of blocks.
 */
static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
{
        if (!blocks)
                return 0;

        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
                return ext4_ext_calc_metadata_amount(inode, blocks);

        return ext4_indirect_calc_metadata_amount(inode, blocks);
}
static void ext4_da_update_reserve_space(struct inode *inode, int used)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int total, mdb, mdb_free;

        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
        /* recalculate the number of metablocks still to be reserved */
        total = EXT4_I(inode)->i_reserved_data_blocks - used;
        mdb = ext4_calc_metadata_amount(inode, total);

        /* figure out how many metablocks to release */
        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
        mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;

        if (mdb_free) {
                /* Account for allocated meta_blocks */
                mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;

                /* update fs dirty blocks counter */
                percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
                EXT4_I(inode)->i_allocated_meta_blocks = 0;
                EXT4_I(inode)->i_reserved_meta_blocks = mdb;
        }

        /* update per-inode reservations */
        BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
        EXT4_I(inode)->i_reserved_data_blocks -= used;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

        /*
         * release the over-booked quota reserved for metadata blocks
         */
        if (mdb_free)
                vfs_dq_release_reservation_block(inode, mdb_free);

        /*
         * If we have done all the pending block allocations and if
         * there aren't any writers on the inode, we can discard the
         * inode's preallocations.
         */
        if (!total && (atomic_read(&inode->i_writecount) == 0))
                ext4_discard_preallocations(inode);
}
static int check_block_validity(struct inode *inode, sector_t logical,
                                sector_t phys, int len)
{
        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
                ext4_error(inode->i_sb, "check_block_validity",
                           "inode #%lu logical block %llu mapped to %llu "
                           "(size %d)", inode->i_ino,
                           (unsigned long long) logical,
                           (unsigned long long) phys, len);
                WARN_ON(1);
                return -EIO;
        }
        return 0;
}
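/*
 * check_block_validity() is the last line of defence against a corrupted
 * block map: a mapping that claims, say, a physical block inside the
 * superblock/group-descriptor area for file data is rejected with -EIO
 * and a WARN_ON rather than being handed to the block layer.
 */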
/*
 * The ext4_get_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem, allocates blocks,
 * stores the allocated blocks in the result buffer head and marks it
 * mapped.
 *
 * If the file is extents based, it calls ext4_ext_get_blocks();
 * otherwise, it calls ext4_ind_get_blocks() to handle indirect-mapping
 * based files.
 *
 * On success, it returns the number of blocks being mapped or allocated.
 * If create == 0 and the blocks are pre-allocated and uninitialized,
 * the resulting buffer head is unmapped.  If create == 1, it will make
 * sure the buffer head is mapped.
 *
 * It returns 0 if plain look up failed (blocks have not been allocated);
 * in that case, the buffer head is unmapped.
 *
 * It returns the error in case of allocation failure.
 */
int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
                    unsigned int max_blocks, struct buffer_head *bh,
                    int flags)
{
        int retval;

        clear_buffer_mapped(bh);
        clear_buffer_unwritten(bh);

        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
        down_read((&EXT4_I(inode)->i_data_sem));
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
                                             bh, 0);
        } else {
                retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
                                             bh, 0);
        }
        up_read((&EXT4_I(inode)->i_data_sem));

        if (retval > 0 && buffer_mapped(bh)) {
                int ret = check_block_validity(inode, block,
                                               bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
        }

        /* If it is only a block(s) look up */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
                return retval;

        /*
         * Returns if the blocks have already been allocated.
         *
         * Note that if blocks have been preallocated,
         * ext4_ext_get_block() returns with create == 0
         * and the buffer head unmapped.
         */
        if (retval > 0 && buffer_mapped(bh))
                return retval;

        /*
         * When we call get_blocks without the create flag, the
         * BH_Unwritten flag could have gotten set if the blocks
         * requested were part of a uninitialized extent.  We need to
         * clear this flag now that we are committed to convert all or
         * part of the uninitialized extent to be an initialized
         * extent.  This is because we need to avoid the combination
         * of BH_Unwritten and BH_Mapped flags being simultaneously
         * set on the buffer_head.
         */
        clear_buffer_unwritten(bh);

        /*
         * New blocks allocated and/or writing to an uninitialized extent
         * will possibly result in updating i_data, so we take
         * the write lock of i_data_sem, and call get_blocks()
         * with create == 1 flag.
         */
        down_write((&EXT4_I(inode)->i_data_sem));

        /*
         * if the caller is from delayed allocation writeout path
         * we have already reserved fs blocks for allocation;
         * let the underlying get_block() function know to
         * avoid double accounting
         */
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                EXT4_I(inode)->i_delalloc_reserved_flag = 1;
        /*
         * We need to check for EXT4 here because migrate
         * could have changed the inode type in between
         */
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
                                             bh, flags);
        } else {
                retval = ext4_ind_get_blocks(handle, inode, block,
                                             max_blocks, bh, flags);

                if (retval > 0 && buffer_new(bh)) {
                        /*
                         * We allocated new blocks which will result in
                         * i_data's format changing.  Force the migrate
                         * to fail by clearing migrate flags
                         */
                        EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
                                                        ~EXT4_EXT_MIGRATE;
                }
        }

        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                EXT4_I(inode)->i_delalloc_reserved_flag = 0;

        /*
         * Update reserved blocks/metadata blocks after successful
         * block allocation which had been deferred till now.
         */
        if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
                ext4_da_update_reserve_space(inode, retval);

        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && buffer_mapped(bh)) {
                int ret = check_block_validity(inode, block,
                                               bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
        }
        return retval;
}
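/*
 * A minimal sketch of the calling convention, with hypothetical values:
 * a lookup of 3 blocks at logical block 10 that finds only 2 contiguous
 * mapped blocks returns 2 with bh mapped to the first physical block; a
 * lookup that hits a hole returns 0 and leaves bh unmapped; allocation
 * failures return a negative errno.
 */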
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096

int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create)
{
        handle_t *handle = ext4_journal_current_handle();
        int ret = 0, started = 0;
        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
        int dio_credits;

        if (create && !handle) {
                /* Direct IO write... */
                if (max_blocks > DIO_MAX_BLOCKS)
                        max_blocks = DIO_MAX_BLOCKS;
                dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
                handle = ext4_journal_start(inode, dio_credits);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
                }
                started = 1;
        }

        ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
                              create ? EXT4_GET_BLOCKS_CREATE : 0);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
        }
        if (started)
                ext4_journal_stop(handle);
out:
        return ret;
}
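/*
 * ext4_get_block() is the get_block_t callback handed to the generic
 * buffer/page helpers (block_write_begin() below, direct IO, etc.).  A
 * rough sketch of a typical call, with assumed values: for a direct IO
 * write with no transaction open, it starts its own handle sized by
 * ext4_chunk_trans_blocks(), maps at most DIO_MAX_BLOCKS at once, and
 * reports the mapped length back through bh_result->b_size.
 */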
/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
                                ext4_lblk_t block, int create, int *errp)
{
        struct buffer_head dummy;
        int fatal = 0, err;
        int flags = 0;

        J_ASSERT(handle != NULL || create == 0);

        dummy.b_state = 0;
        dummy.b_blocknr = -1000;
        buffer_trace_init(&dummy.b_history);
        if (create)
                flags |= EXT4_GET_BLOCKS_CREATE;
        err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
        /*
         * ext4_get_blocks() returns the number of blocks mapped, or 0 in
         * the case of a HOLE.
         */
        if (err > 0) {
                if (err > 1)
                        WARN_ON(1);
                err = 0;
        }
        *errp = err;
        if (!err && buffer_mapped(&dummy)) {
                struct buffer_head *bh;
                bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
                if (!bh) {
                        *errp = -EIO;
                        goto err;
                }
                if (buffer_new(&dummy)) {
                        J_ASSERT(create != 0);
                        J_ASSERT(handle != NULL);

                        /*
                         * Now that we do not always journal data, we should
                         * keep in mind whether this should always journal the
                         * new buffer as metadata.  For now, regular file
                         * writes use ext4_get_block instead, so it's not a
                         * problem.
                         */
                        lock_buffer(bh);
                        BUFFER_TRACE(bh, "call get_create_access");
                        fatal = ext4_journal_get_create_access(handle, bh);
                        if (!fatal && !buffer_uptodate(bh)) {
                                memset(bh->b_data, 0, inode->i_sb->s_blocksize);
                                set_buffer_uptodate(bh);
                        }
                        unlock_buffer(bh);
                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
                        err = ext4_handle_dirty_metadata(handle, inode, bh);
                        if (!fatal)
                                fatal = err;
                } else {
                        BUFFER_TRACE(bh, "not a new buffer");
                }
                if (fatal) {
                        *errp = fatal;
                        brelse(bh);
                        bh = NULL;
                }
                return bh;
        }
err:
        return NULL;
}
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
                               ext4_lblk_t block, int create, int *err)
{
        struct buffer_head *bh;

        bh = ext4_getblk(handle, inode, block, create, err);
        if (!bh)
                return bh;
        if (buffer_uptodate(bh))
                return bh;
        ll_rw_block(READ_META, 1, &bh);
        wait_on_buffer(bh);
        if (buffer_uptodate(bh))
                return bh;
        put_bh(bh);
        *err = -EIO;
        return NULL;
}
static int walk_page_buffers(handle_t *handle,
                             struct buffer_head *head,
                             unsigned from,
                             unsigned to,
                             int *partial,
                             int (*fn)(handle_t *handle,
                                       struct buffer_head *bh))
{
        struct buffer_head *bh;
        unsigned block_start, block_end;
        unsigned blocksize = head->b_size;
        int err, ret = 0;
        struct buffer_head *next;

        for (bh = head, block_start = 0;
             ret == 0 && (bh != head || !block_start);
             block_start = block_end, bh = next) {
                next = bh->b_this_page;
                block_end = block_start + blocksize;
                if (block_end <= from || block_start >= to) {
                        if (partial && !buffer_uptodate(bh))
                                *partial = 1;
                        continue;
                }
                err = (*fn)(handle, bh);
                if (!ret)
                        ret = err;
        }
        return ret;
}
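/*
 * Typical use of walk_page_buffers(), as in ext4_write_begin() below
 * (data=journal case):
 *
 *	ret = walk_page_buffers(handle, page_buffers(page), from, to,
 *				NULL, do_journal_get_write_access);
 *
 * i.e. apply @fn to every buffer_head that overlaps [from, to) within
 * the page, recording in *@partial whether any untouched buffer is not
 * up to date.
 */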
/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the commit_write().  So doing the jbd2_journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext4_writepage() ->
 * block_write_full_page().  In that case, we *know* that ext4_writepage()
 * has generated enough buffer credits to do the whole page.  So we won't
 * block on the journal in that case, which is good, because the caller may
 * be PF_MEMALLOC.
 *
 * By accident, ext4 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */
static int do_journal_get_write_access(handle_t *handle,
                                       struct buffer_head *bh)
{
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        return ext4_journal_get_write_access(handle, bh);
}
static int ext4_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned flags,
                            struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;
        int ret, needed_blocks;
        handle_t *handle;
        int retries = 0;
        struct page *page;
        pgoff_t index;
        unsigned from, to;

        trace_ext4_write_begin(inode, pos, len, flags);
        /*
         * Reserve one block more for addition to orphan list in case
         * we allocate blocks but write fails for some reason
         */
        needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
        index = pos >> PAGE_CACHE_SHIFT;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;

retry:
        handle = ext4_journal_start(inode, needed_blocks);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out;
        }

        /* We cannot recurse into the filesystem as the transaction is already
         * started */
        flags |= AOP_FLAG_NOFS;

        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page) {
                ext4_journal_stop(handle);
                ret = -ENOMEM;
                goto out;
        }
        *pagep = page;

        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                ext4_get_block);

        if (!ret && ext4_should_journal_data(inode)) {
                ret = walk_page_buffers(handle, page_buffers(page),
                                from, to, NULL, do_journal_get_write_access);
        }

        if (ret) {
                unlock_page(page);
                page_cache_release(page);
                /*
                 * block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again.  Don't need
                 * i_size_read because we hold i_mutex.
                 *
                 * Add inode to orphan list in case we crash before
                 * truncate finishes
                 */
                if (pos + len > inode->i_size && ext4_can_truncate(inode))
                        ext4_orphan_add(handle, inode);

                ext4_journal_stop(handle);
                if (pos + len > inode->i_size) {
                        ext4_truncate(inode);
                        /*
                         * If truncate failed early the inode might
                         * still be on the orphan list; we need to
                         * make sure the inode is removed from the
                         * orphan list in that case.
                         */
                        if (inode->i_nlink)
                                ext4_orphan_del(NULL, inode);
                }
        }

        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
out:
        return ret;
}
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        set_buffer_uptodate(bh);
        return ext4_handle_dirty_metadata(handle, NULL, bh);
}
static int ext4_generic_write_end(struct file *file,
                                  struct address_space *mapping,
                                  loff_t pos, unsigned len, unsigned copied,
                                  struct page *page, void *fsdata)
{
        int i_size_changed = 0;
        struct inode *inode = mapping->host;
        handle_t *handle = ext4_journal_current_handle();

        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

        /*
         * No need to use i_size_read() here, the i_size
         * cannot change under us because we hold i_mutex.
         *
         * But it's important to update i_size while still holding page lock:
         * page writeout could otherwise come in and zero beyond i_size.
         */
        if (pos + copied > inode->i_size) {
                i_size_write(inode, pos + copied);
                i_size_changed = 1;
        }

        if (pos + copied > EXT4_I(inode)->i_disksize) {
                /* We need to mark the inode dirty even if
                 * new_i_size is less than inode->i_size
                 * but greater than i_disksize. (hint: delalloc)
                 */
                ext4_update_i_disksize(inode, (pos + copied));
                i_size_changed = 1;
        }
        unlock_page(page);
        page_cache_release(page);

        /*
         * Don't mark the inode dirty under page lock.  First, it unnecessarily
         * makes the holding time of page lock longer.  Second, it forces lock
         * ordering of page lock and transaction start for journaling
         * filesystems.
         */
        if (i_size_changed)
                ext4_mark_inode_dirty(handle, inode);

        return copied;
}
/*
 * We need to pick up the new inode size which generic_commit_write gave us.
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->private_list.  metadata
 * buffers are managed internally.
 */
static int ext4_ordered_write_end(struct file *file,
                                  struct address_space *mapping,
                                  loff_t pos, unsigned len, unsigned copied,
                                  struct page *page, void *fsdata)
{
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        int ret = 0, ret2;

        trace_ext4_ordered_write_end(inode, pos, len, copied);
        ret = ext4_jbd2_file_inode(handle, inode);

        if (ret == 0) {
                ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
                                              page, fsdata);
                copied = ret2;
                if (pos + len > inode->i_size && ext4_can_truncate(inode))
                        /* if we have allocated more blocks and copied
                         * less, we will have blocks allocated outside
                         * inode->i_size, so truncate them
                         */
                        ext4_orphan_add(handle, inode);
                if (ret2 < 0)
                        ret = ret2;
        }
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;

        if (pos + len > inode->i_size) {
                ext4_truncate(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        return ret ? ret : copied;
}
static int ext4_writeback_write_end(struct file *file,
                                    struct address_space *mapping,
                                    loff_t pos, unsigned len, unsigned copied,
                                    struct page *page, void *fsdata)
{
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        int ret = 0, ret2;

        trace_ext4_writeback_write_end(inode, pos, len, copied);
        ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
                                      page, fsdata);
        copied = ret2;
        if (pos + len > inode->i_size && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less, we will have blocks allocated outside
                 * inode->i_size, so truncate them
                 */
                ext4_orphan_add(handle, inode);

        if (ret2 < 0)
                ret = ret2;

        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;

        if (pos + len > inode->i_size) {
                ext4_truncate(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        return ret ? ret : copied;
}
static int ext4_journalled_write_end(struct file *file,
                                     struct address_space *mapping,
                                     loff_t pos, unsigned len, unsigned copied,
                                     struct page *page