
/fs/ext4/mballoc.c

https://github.com/mstsirkin/linux
  1. /*
  2. * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
  3. * Written by Alex Tomas <alex@clusterfs.com>
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License version 2 as
  7. * published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
  17. */
  18. /*
  19. * mballoc.c contains the multiblocks allocation routines
  20. */
  21. #include "mballoc.h"
  22. #include <linux/debugfs.h>
  23. #include <linux/slab.h>
  24. #include <trace/events/ext4.h>
  25. /*
  26. * MUSTDO:
  27. * - test ext4_ext_search_left() and ext4_ext_search_right()
  28. * - search for metadata in few groups
  29. *
  30. * TODO v4:
  31. * - normalization should take into account whether file is still open
  32. * - discard preallocations if no free space left (policy?)
  33. * - don't normalize tails
  34. * - quota
  35. * - reservation for superuser
  36. *
  37. * TODO v3:
  38. * - bitmap read-ahead (proposed by Oleg Drokin aka green)
  39. * - track min/max extents in each group for better group selection
  40. * - mb_mark_used() may allocate chunk right after splitting buddy
  41. * - tree of groups sorted by number of free blocks
  42. * - error handling
  43. */
  44. /*
  45. * The allocation request involves a request for multiple blocks
  46. * near the specified goal (block) value.
  47. *
  48. * During initialization phase of the allocator we decide to use the
  49. * group preallocation or inode preallocation depending on the size of
  50. * the file. The size of the file could be the resulting file size we
  51. * would have after allocation, or the current file size, whichever
  52. * is larger. If the size is less than sbi->s_mb_stream_request we
  53. * select to use the group preallocation. The default value of
  54. * s_mb_stream_request is 16 blocks. This can also be tuned via
  55. * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
  56. * terms of number of blocks.
  57. *
  58. * The main motivation for having small files use group preallocation is to
  59. * ensure that small files are kept close together on the disk.
  60. *
  61. * In the first stage the allocator looks at the inode prealloc list,
  62. * ext4_inode_info->i_prealloc_list, which contains list of prealloc
  63. * spaces for this particular inode. The inode prealloc space is
  64. * represented as:
  65. *
  66. * pa_lstart -> the logical start block for this prealloc space
  67. * pa_pstart -> the physical start block for this prealloc space
  68. * pa_len -> length for this prealloc space
  69. * pa_free -> free space available in this prealloc space
  70. *
  71. * The inode preallocation space is used by looking at the _logical_ start
  72. * block. Only if the logical file block falls within the range of a prealloc
  73. * space do we consume that particular prealloc space. This makes sure that
  74. * we have contiguous physical blocks representing the file blocks.
  75. *
  76. * The important thing to note about the inode prealloc space is that
  77. * we don't modify the values associated with it except
  78. * pa_free.
  79. *
  80. * If we are not able to find blocks in the inode prealloc space and if we
  81. * have the group allocation flag set then we look at the locality group
  82. * prealloc space. This is a per-CPU prealloc list, represented as
  83. *
  84. * ext4_sb_info.s_locality_groups[smp_processor_id()]
  85. *
  86. * The reason for having a per cpu locality group is to reduce the contention
  87. * between CPUs. It is possible to get scheduled at this point.
  88. *
  89. * The locality group prealloc space is used looking at whether we have
  90. * enough free space (pa_free) within the prealloc space.
  91. *
  92. * If we can't allocate blocks via inode prealloc and/or locality group
  93. * prealloc then we look at the buddy cache. The buddy cache is represented
  94. * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
  95. * mapped to the buddy and bitmap information regarding different
  96. * groups. The buddy information is attached to buddy cache inode so that
  97. * we can access them through the page cache. The information regarding
  98. * each group is loaded via ext4_mb_load_buddy. The information involves the
  99. * block bitmap and the buddy information. The information is stored in the
  100. * inode as:
  101. *
  102. * { page }
  103. * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
  104. *
  105. *
  106. * one block each for bitmap and buddy information. So for each group we
  107. * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
  108. * blocksize) blocks. So it can have information regarding groups_per_page
  109. * which is blocks_per_page/2
  110. *
  111. * The buddy cache inode is not stored on disk. The inode is thrown
  112. * away when the filesystem is unmounted.
  113. *
  114. * We look for count number of blocks in the buddy cache. If we were able
  115. * to locate that many free blocks we return with additional information
  116. * regarding the rest of the contiguous physical blocks available.
  117. *
  118. * Before allocating blocks via the buddy cache we normalize the request.
  119. * This ensures we ask for more blocks than we need. The extra
  120. * blocks that we get after allocation are added to the respective prealloc
  121. * list. In case of inode preallocation we follow a list of heuristics
  122. * based on file size. This can be found in ext4_mb_normalize_request. If
  123. * we are doing a group prealloc we try to normalize the request to
  124. * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
  125. * 512 blocks. This can be tuned via
  126. * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
  127. * terms of number of blocks. If we have mounted the file system with -O
  128. * stripe=<value> option the group prealloc request is normalized to
  129. * the smallest multiple of the stripe value (sbi->s_stripe) which is
  130. * greater than the default mb_group_prealloc.
  131. *
  132. * The regular allocator (using the buddy cache) supports a few tunables.
  133. *
  134. * /sys/fs/ext4/<partition>/mb_min_to_scan
  135. * /sys/fs/ext4/<partition>/mb_max_to_scan
  136. * /sys/fs/ext4/<partition>/mb_order2_req
  137. *
  138. * The regular allocator uses buddy scan only if the request len is a power of
  139. * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
  140. * value of s_mb_order2_reqs can be tuned via
  141. * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
  142. * stripe size (sbi->s_stripe), we try to search for contiguous blocks of
  143. * stripe size. This should result in better allocation on RAID setups. If
  144. * not, we search in the specific group using the bitmap for best extents. The
  145. * tunables min_to_scan and max_to_scan control the behaviour here.
  146. * min_to_scan indicates how long mballoc __must__ look for a best
  147. * extent and max_to_scan indicates how long mballoc __can__ look for a
  148. * best extent in the found extents. Searching for the blocks starts with
  149. * the group specified as the goal value in allocation context via
  150. * ac_g_ex. Each group is first checked based on the criteria whether it
  151. * can be used for allocation. ext4_mb_good_group explains how the groups are
  152. * checked.
  153. *
  154. * Both prealloc spaces are populated as described above. So for the first
  155. * request we will hit the buddy cache, which will result in this prealloc
  156. * space getting filled. The prealloc space is then later used for
  157. * subsequent requests.
  158. */
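/*
 * Illustrative user-space sketch (not a literal excerpt of this file) of
 * the size-based choice between locality-group and per-inode
 * preallocation described above.  The helper name and the struct are
 * hypothetical; only the rule "use group preallocation when
 * max(current size, size after allocation) < s_mb_stream_request"
 * comes from the comment.
 */
#include <stdio.h>

struct alloc_request {
	unsigned long long cur_size_blocks;   /* current file size, in blocks */
	unsigned long long new_size_blocks;   /* size the file would have after allocation */
};

static int use_group_preallocation(const struct alloc_request *req,
				   unsigned int mb_stream_request)
{
	unsigned long long size = req->cur_size_blocks > req->new_size_blocks ?
				  req->cur_size_blocks : req->new_size_blocks;

	/* small (streaming) files share per-CPU locality-group preallocation */
	return size < mb_stream_request;
}

int main(void)
{
	struct alloc_request small = { .cur_size_blocks = 3,  .new_size_blocks = 8  };
	struct alloc_request large = { .cur_size_blocks = 10, .new_size_blocks = 64 };

	/* 16 blocks is the default s_mb_stream_request mentioned above */
	printf("small file -> %s preallocation\n",
	       use_group_preallocation(&small, 16) ? "group" : "inode");
	printf("large file -> %s preallocation\n",
	       use_group_preallocation(&large, 16) ? "group" : "inode");
	return 0;
}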
  159. /*
  160. * mballoc operates on the following data:
  161. * - on-disk bitmap
  162. * - in-core buddy (actually includes buddy and bitmap)
  163. * - preallocation descriptors (PAs)
  164. *
  165. * there are two types of preallocations:
  166. * - inode
  167. * assigned to a specific inode and can be used for this inode only.
  168. * it describes part of inode's space preallocated to specific
  169. * physical blocks. any block from that preallocation can be used
  170. * independently. the descriptor just tracks number of blocks left
  171. * unused. so, before taking some block from descriptor, one must
  172. * make sure the corresponding logical block isn't allocated yet. this
  173. * also means that freeing any block within descriptor's range
  174. * must discard all preallocated blocks.
  175. * - locality group
  176. * assigned to specific locality group which does not translate to
  177. * permanent set of inodes: inode can join and leave group. space
  178. * from this type of preallocation can be used for any inode. thus
  179. * it's consumed from the beginning to the end.
  180. *
  181. * relation between them can be expressed as:
  182. * in-core buddy = on-disk bitmap + preallocation descriptors
  183. *
  184. * this means the blocks mballoc considers used are:
  185. * - allocated blocks (persistent)
  186. * - preallocated blocks (non-persistent)
  187. *
  188. * consistency in mballoc world means that at any time a block is either
  189. * free or used in ALL structures. notice: "any time" should not be read
  190. * literally -- time is discrete and delimited by locks.
  191. *
  192. * to keep it simple, we don't use block numbers, instead we count number of
  193. * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
  194. *
  195. * all operations can be expressed as:
  196. * - init buddy: buddy = on-disk + PAs
  197. * - new PA: buddy += N; PA = N
  198. * - use inode PA: on-disk += N; PA -= N
  199. * - discard inode PA: buddy -= on-disk - PA; PA = 0
  200. * - use locality group PA: on-disk += N; PA -= N
  201. * - discard locality group PA: buddy -= PA; PA = 0
  202. * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
  203. * is used in real operation because we can't know actual used
  204. * bits from PA, only from on-disk bitmap
  205. *
  206. * if we follow this strict logic, then all operations above should be atomic.
  207. * given some of them can block, we'd have to use something like semaphores
  208. * killing performance on high-end SMP hardware. let's try to relax it using
  209. * the following knowledge:
  210. * 1) if buddy is referenced, it's already initialized
  211. * 2) while block is used in buddy and the buddy is referenced,
  212. * nobody can re-allocate that block
  213. * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
  214. * bit set and PA claims same block, it's OK. IOW, one can set bit in
  215. * on-disk bitmap if buddy has the same bit set and/or PA covers the
  216. * corresponding block
  217. *
  218. * so, now we're building a concurrency table:
  219. * - init buddy vs.
  220. * - new PA
  221. * blocks for PA are allocated in the buddy, buddy must be referenced
  222. * until PA is linked to allocation group to avoid concurrent buddy init
  223. * - use inode PA
  224. * we need to make sure that either on-disk bitmap or PA has uptodate data
  225. * given (3) we care that PA-=N operation doesn't interfere with init
  226. * - discard inode PA
  227. * the simplest way would be to have buddy initialized by the discard
  228. * - use locality group PA
  229. * again PA-=N must be serialized with init
  230. * - discard locality group PA
  231. * the simplest way would be to have buddy initialized by the discard
  232. * - new PA vs.
  233. * - use inode PA
  234. * i_data_sem serializes them
  235. * - discard inode PA
  236. * discard process must wait until PA isn't used by another process
  237. * - use locality group PA
  238. * some mutex should serialize them
  239. * - discard locality group PA
  240. * discard process must wait until PA isn't used by another process
  241. * - use inode PA
  242. * - use inode PA
  243. * i_data_sem or another mutex should serialize them
  244. * - discard inode PA
  245. * discard process must wait until PA isn't used by another process
  246. * - use locality group PA
  247. * nothing wrong here -- they're different PAs covering different blocks
  248. * - discard locality group PA
  249. * discard process must wait until PA isn't used by another process
  250. *
  251. * now we're ready to draw a few conclusions:
  252. * - while a PA is referenced, no discard is possible
  253. * - a PA is referenced until the block is marked in the on-disk bitmap
  254. * - PA changes only after the on-disk bitmap
  255. * - discard must not compete with init. either init is done before
  256. * any discard or they're serialized somehow
  257. * - buddy init as sum of on-disk bitmap and PAs is done atomically
  258. *
  259. * a special case is when we've used a PA to emptiness. no need to modify
  260. * the buddy in this case, but we should take care about concurrent init
  261. *
  262. */
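/*
 * Illustrative user-space sketch of the accounting above: the in-core
 * buddy counts a block as used when it is either allocated on disk or
 * covered by a preallocation (PA).  Names are hypothetical; the
 * arithmetic mirrors the "all operations can be expressed as" list.
 */
#include <assert.h>
#include <stdio.h>

struct mb_counts {
	unsigned on_disk;  /* blocks marked used in the on-disk bitmap */
	unsigned pa;       /* blocks still reserved by preallocations  */
	unsigned buddy;    /* blocks marked used in the in-core buddy  */
};

static void check(const struct mb_counts *c)
{
	/* in-core buddy = on-disk bitmap + preallocation descriptors */
	assert(c->buddy == c->on_disk + c->pa);
}

int main(void)
{
	struct mb_counts c = { .on_disk = 100, .pa = 0, .buddy = 100 };

	/* new PA of 32 blocks: buddy += N; PA = N */
	c.buddy += 32; c.pa += 32; check(&c);

	/* use 8 blocks from the PA: on-disk += N; PA -= N */
	c.on_disk += 8; c.pa -= 8; check(&c);

	/* discard the PA: the unused blocks become free in the buddy again */
	c.buddy -= c.pa; c.pa = 0; check(&c);

	printf("on-disk=%u pa=%u buddy=%u\n", c.on_disk, c.pa, c.buddy);
	return 0;
}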
  263. /*
  264. * Logic in few words:
  265. *
  266. * - allocation:
  267. * load group
  268. * find blocks
  269. * mark bits in on-disk bitmap
  270. * release group
  271. *
  272. * - use preallocation:
  273. * find proper PA (per-inode or group)
  274. * load group
  275. * mark bits in on-disk bitmap
  276. * release group
  277. * release PA
  278. *
  279. * - free:
  280. * load group
  281. * mark bits in on-disk bitmap
  282. * release group
  283. *
  284. * - discard preallocations in group:
  285. * mark PAs deleted
  286. * move them onto local list
  287. * load on-disk bitmap
  288. * load group
  289. * remove PA from object (inode or locality group)
  290. * mark free blocks in-core
  291. *
  292. * - discard inode's preallocations:
  293. */
  294. /*
  295. * Locking rules
  296. *
  297. * Locks:
  298. * - bitlock on a group (group)
  299. * - object (inode/locality) (object)
  300. * - per-pa lock (pa)
  301. *
  302. * Paths:
  303. * - new pa
  304. * object
  305. * group
  306. *
  307. * - find and use pa:
  308. * pa
  309. *
  310. * - release consumed pa:
  311. * pa
  312. * group
  313. * object
  314. *
  315. * - generate in-core bitmap:
  316. * group
  317. * pa
  318. *
  319. * - discard all for given object (inode, locality group):
  320. * object
  321. * pa
  322. * group
  323. *
  324. * - discard all for given group:
  325. * group
  326. * pa
  327. * group
  328. * object
  329. *
  330. */
  331. static struct kmem_cache *ext4_pspace_cachep;
  332. static struct kmem_cache *ext4_ac_cachep;
  333. static struct kmem_cache *ext4_free_ext_cachep;
  334. /* We create slab caches for groupinfo data structures based on the
  335. * superblock block size. There will be one per mounted filesystem for
  336. * each unique s_blocksize_bits */
  337. #define NR_GRPINFO_CACHES 8
  338. static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
  339. static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
  340. "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
  341. "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
  342. "ext4_groupinfo_64k", "ext4_groupinfo_128k"
  343. };
  344. static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
  345. ext4_group_t group);
  346. static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
  347. ext4_group_t group);
  348. static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
  349. static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
  350. {
  351. #if BITS_PER_LONG == 64
  352. *bit += ((unsigned long) addr & 7UL) << 3;
  353. addr = (void *) ((unsigned long) addr & ~7UL);
  354. #elif BITS_PER_LONG == 32
  355. *bit += ((unsigned long) addr & 3UL) << 3;
  356. addr = (void *) ((unsigned long) addr & ~3UL);
  357. #else
  358. #error "how many bits you are?!"
  359. #endif
  360. return addr;
  361. }
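/*
 * Illustrative user-space sketch of the pointer/bit adjustment above:
 * moving a misaligned address down to a long-aligned one while adding
 * 8 to the bit index for every byte skipped refers to exactly the same
 * bit.  The byte-ordered test_bit8() helper is hypothetical and only
 * used here to demonstrate the equivalence.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static int test_bit8(const unsigned char *addr, int bit)
{
	return (addr[bit >> 3] >> (bit & 7)) & 1;
}

int main(void)
{
	static unsigned char buf[32] = { 0 };
	/* an 8-byte aligned base somewhere inside buf, then a misaligned view */
	unsigned char *base = (unsigned char *)(((uintptr_t)buf + 7) & ~(uintptr_t)7);
	unsigned char *misaligned = base + 3;
	int bit = 13;

	base[3 + (bit >> 3)] |= 1u << (bit & 7);   /* set the bit through the misaligned view */

	/* the same adjustment mb_correct_addr_and_bit() performs on 64-bit */
	unsigned char *aligned = (unsigned char *)((uintptr_t)misaligned & ~(uintptr_t)7);
	int adjusted = bit + (int)(((uintptr_t)misaligned & 7) << 3);

	assert(aligned == base);
	assert(test_bit8(misaligned, bit) == 1);
	assert(test_bit8(aligned, adjusted) == 1);
	printf("bit %d via %p == bit %d via %p\n", bit, (void *)misaligned,
	       adjusted, (void *)aligned);
	return 0;
}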
  362. static inline int mb_test_bit(int bit, void *addr)
  363. {
  364. /*
  365. * ext4_test_bit on architecture like powerpc
  366. * needs unsigned long aligned address
  367. */
  368. addr = mb_correct_addr_and_bit(&bit, addr);
  369. return ext4_test_bit(bit, addr);
  370. }
  371. static inline void mb_set_bit(int bit, void *addr)
  372. {
  373. addr = mb_correct_addr_and_bit(&bit, addr);
  374. ext4_set_bit(bit, addr);
  375. }
  376. static inline void mb_clear_bit(int bit, void *addr)
  377. {
  378. addr = mb_correct_addr_and_bit(&bit, addr);
  379. ext4_clear_bit(bit, addr);
  380. }
  381. static inline int mb_find_next_zero_bit(void *addr, int max, int start)
  382. {
  383. int fix = 0, ret, tmpmax;
  384. addr = mb_correct_addr_and_bit(&fix, addr);
  385. tmpmax = max + fix;
  386. start += fix;
  387. ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
  388. if (ret > max)
  389. return max;
  390. return ret;
  391. }
  392. static inline int mb_find_next_bit(void *addr, int max, int start)
  393. {
  394. int fix = 0, ret, tmpmax;
  395. addr = mb_correct_addr_and_bit(&fix, addr);
  396. tmpmax = max + fix;
  397. start += fix;
  398. ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
  399. if (ret > max)
  400. return max;
  401. return ret;
  402. }
  403. static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
  404. {
  405. char *bb;
  406. BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
  407. BUG_ON(max == NULL);
  408. if (order > e4b->bd_blkbits + 1) {
  409. *max = 0;
  410. return NULL;
  411. }
  412. /* at order 0 we see each particular block */
  413. if (order == 0) {
  414. *max = 1 << (e4b->bd_blkbits + 3);
  415. return EXT4_MB_BITMAP(e4b);
  416. }
  417. bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
  418. *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
  419. return bb;
  420. }
  421. #ifdef DOUBLE_CHECK
  422. static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
  423. int first, int count)
  424. {
  425. int i;
  426. struct super_block *sb = e4b->bd_sb;
  427. if (unlikely(e4b->bd_info->bb_bitmap == NULL))
  428. return;
  429. assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
  430. for (i = 0; i < count; i++) {
  431. if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
  432. ext4_fsblk_t blocknr;
  433. blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
  434. blocknr += first + i;
  435. ext4_grp_locked_error(sb, e4b->bd_group,
  436. inode ? inode->i_ino : 0,
  437. blocknr,
  438. "freeing block already freed "
  439. "(bit %u)",
  440. first + i);
  441. }
  442. mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
  443. }
  444. }
  445. static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
  446. {
  447. int i;
  448. if (unlikely(e4b->bd_info->bb_bitmap == NULL))
  449. return;
  450. assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
  451. for (i = 0; i < count; i++) {
  452. BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
  453. mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
  454. }
  455. }
  456. static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
  457. {
  458. if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
  459. unsigned char *b1, *b2;
  460. int i;
  461. b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
  462. b2 = (unsigned char *) bitmap;
  463. for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
  464. if (b1[i] != b2[i]) {
  465. ext4_msg(e4b->bd_sb, KERN_ERR,
  466. "corruption in group %u "
  467. "at byte %u(%u): %x in copy != %x "
  468. "on disk/prealloc",
  469. e4b->bd_group, i, i * 8, b1[i], b2[i]);
  470. BUG();
  471. }
  472. }
  473. }
  474. }
  475. #else
  476. static inline void mb_free_blocks_double(struct inode *inode,
  477. struct ext4_buddy *e4b, int first, int count)
  478. {
  479. return;
  480. }
  481. static inline void mb_mark_used_double(struct ext4_buddy *e4b,
  482. int first, int count)
  483. {
  484. return;
  485. }
  486. static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
  487. {
  488. return;
  489. }
  490. #endif
  491. #ifdef AGGRESSIVE_CHECK
  492. #define MB_CHECK_ASSERT(assert) \
  493. do { \
  494. if (!(assert)) { \
  495. printk(KERN_EMERG \
  496. "Assertion failure in %s() at %s:%d: \"%s\"\n", \
  497. function, file, line, # assert); \
  498. BUG(); \
  499. } \
  500. } while (0)
  501. static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
  502. const char *function, int line)
  503. {
  504. struct super_block *sb = e4b->bd_sb;
  505. int order = e4b->bd_blkbits + 1;
  506. int max;
  507. int max2;
  508. int i;
  509. int j;
  510. int k;
  511. int count;
  512. struct ext4_group_info *grp;
  513. int fragments = 0;
  514. int fstart;
  515. struct list_head *cur;
  516. void *buddy;
  517. void *buddy2;
  518. {
  519. static int mb_check_counter;
  520. if (mb_check_counter++ % 100 != 0)
  521. return 0;
  522. }
  523. while (order > 1) {
  524. buddy = mb_find_buddy(e4b, order, &max);
  525. MB_CHECK_ASSERT(buddy);
  526. buddy2 = mb_find_buddy(e4b, order - 1, &max2);
  527. MB_CHECK_ASSERT(buddy2);
  528. MB_CHECK_ASSERT(buddy != buddy2);
  529. MB_CHECK_ASSERT(max * 2 == max2);
  530. count = 0;
  531. for (i = 0; i < max; i++) {
  532. if (mb_test_bit(i, buddy)) {
  533. /* only single bit in buddy2 may be 1 */
  534. if (!mb_test_bit(i << 1, buddy2)) {
  535. MB_CHECK_ASSERT(
  536. mb_test_bit((i<<1)+1, buddy2));
  537. } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
  538. MB_CHECK_ASSERT(
  539. mb_test_bit(i << 1, buddy2));
  540. }
  541. continue;
  542. }
  543. /* both bits in buddy2 must be 0 */
  544. MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
  545. MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
  546. for (j = 0; j < (1 << order); j++) {
  547. k = (i * (1 << order)) + j;
  548. MB_CHECK_ASSERT(
  549. !mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
  550. }
  551. count++;
  552. }
  553. MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
  554. order--;
  555. }
  556. fstart = -1;
  557. buddy = mb_find_buddy(e4b, 0, &max);
  558. for (i = 0; i < max; i++) {
  559. if (!mb_test_bit(i, buddy)) {
  560. MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
  561. if (fstart == -1) {
  562. fragments++;
  563. fstart = i;
  564. }
  565. continue;
  566. }
  567. fstart = -1;
  568. /* check used bits only */
  569. for (j = 0; j < e4b->bd_blkbits + 1; j++) {
  570. buddy2 = mb_find_buddy(e4b, j, &max2);
  571. k = i >> j;
  572. MB_CHECK_ASSERT(k < max2);
  573. MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
  574. }
  575. }
  576. MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
  577. MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
  578. grp = ext4_get_group_info(sb, e4b->bd_group);
  579. list_for_each(cur, &grp->bb_prealloc_list) {
  580. ext4_group_t groupnr;
  581. struct ext4_prealloc_space *pa;
  582. pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
  583. ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
  584. MB_CHECK_ASSERT(groupnr == e4b->bd_group);
  585. for (i = 0; i < pa->pa_len; i++)
  586. MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
  587. }
  588. return 0;
  589. }
  590. #undef MB_CHECK_ASSERT
  591. #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
  592. __FILE__, __func__, __LINE__)
  593. #else
  594. #define mb_check_buddy(e4b)
  595. #endif
  596. /*
  597. * Divide blocks started from @first with length @len into
  598. * smaller chunks with power of 2 blocks.
  599. * Clear the bits in the bitmap which the blocks of the chunk(s) cover,
  600. * then increase bb_counters[] for the corresponding chunk size.
  601. */
  602. static void ext4_mb_mark_free_simple(struct super_block *sb,
  603. void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
  604. struct ext4_group_info *grp)
  605. {
  606. struct ext4_sb_info *sbi = EXT4_SB(sb);
  607. ext4_grpblk_t min;
  608. ext4_grpblk_t max;
  609. ext4_grpblk_t chunk;
  610. unsigned short border;
  611. BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
  612. border = 2 << sb->s_blocksize_bits;
  613. while (len > 0) {
  614. /* find how many blocks can be covered since this position */
  615. max = ffs(first | border) - 1;
  616. /* find how many blocks of power 2 we need to mark */
  617. min = fls(len) - 1;
  618. if (max < min)
  619. min = max;
  620. chunk = 1 << min;
  621. /* mark multiblock chunks only */
  622. grp->bb_counters[min]++;
  623. if (min > 0)
  624. mb_clear_bit(first >> min,
  625. buddy + sbi->s_mb_offsets[min]);
  626. len -= chunk;
  627. first += chunk;
  628. }
  629. }
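/*
 * Illustrative user-space sketch of the chunk-splitting loop above: a
 * free range [first, first+len) is cut into power-of-two chunks whose
 * order is limited both by the alignment of 'first' and by the remaining
 * length.  lowest_set()/highest_set() stand in for ffs()-1 and fls()-1.
 */
#include <stdio.h>

static int lowest_set(unsigned v)  { int i = 0; while (!(v & 1)) { v >>= 1; i++; } return i; }
static int highest_set(unsigned v) { int i = -1; while (v) { v >>= 1; i++; } return i; }

int main(void)
{
	unsigned first = 5, len = 13;
	unsigned border = 16;            /* largest chunk allowed, like 2 << blkbits above */

	while (len > 0) {
		int max = lowest_set(first | border);   /* alignment of 'first'        */
		int min = highest_set(len);             /* largest power of two <= len */
		unsigned chunk;

		if (max < min)
			min = max;
		chunk = 1u << min;
		printf("order-%d chunk of %u blocks at %u\n", min, chunk, first);
		first += chunk;
		len -= chunk;
	}
	return 0;
}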
  630. /*
  631. * Cache the order of the largest free extent we have available in this block
  632. * group.
  633. */
  634. static void
  635. mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
  636. {
  637. int i;
  638. int bits;
  639. grp->bb_largest_free_order = -1; /* uninit */
  640. bits = sb->s_blocksize_bits + 1;
  641. for (i = bits; i >= 0; i--) {
  642. if (grp->bb_counters[i] > 0) {
  643. grp->bb_largest_free_order = i;
  644. break;
  645. }
  646. }
  647. }
  648. static noinline_for_stack
  649. void ext4_mb_generate_buddy(struct super_block *sb,
  650. void *buddy, void *bitmap, ext4_group_t group)
  651. {
  652. struct ext4_group_info *grp = ext4_get_group_info(sb, group);
  653. ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
  654. ext4_grpblk_t i = 0;
  655. ext4_grpblk_t first;
  656. ext4_grpblk_t len;
  657. unsigned free = 0;
  658. unsigned fragments = 0;
  659. unsigned long long period = get_cycles();
  660. /* initialize buddy from bitmap which is aggregation
  661. * of on-disk bitmap and preallocations */
  662. i = mb_find_next_zero_bit(bitmap, max, 0);
  663. grp->bb_first_free = i;
  664. while (i < max) {
  665. fragments++;
  666. first = i;
  667. i = mb_find_next_bit(bitmap, max, i);
  668. len = i - first;
  669. free += len;
  670. if (len > 1)
  671. ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
  672. else
  673. grp->bb_counters[0]++;
  674. if (i < max)
  675. i = mb_find_next_zero_bit(bitmap, max, i);
  676. }
  677. grp->bb_fragments = fragments;
  678. if (free != grp->bb_free) {
  679. ext4_grp_locked_error(sb, group, 0, 0,
  680. "%u blocks in bitmap, %u in gd",
  681. free, grp->bb_free);
  682. /*
  683. * If we intend to continue, we consider the group descriptor
  684. * corrupt and update bb_free using the bitmap value
  685. */
  686. grp->bb_free = free;
  687. }
  688. mb_set_largest_free_order(sb, grp);
  689. clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
  690. period = get_cycles() - period;
  691. spin_lock(&EXT4_SB(sb)->s_bal_lock);
  692. EXT4_SB(sb)->s_mb_buddies_generated++;
  693. EXT4_SB(sb)->s_mb_generation_time += period;
  694. spin_unlock(&EXT4_SB(sb)->s_bal_lock);
  695. }
  696. /* The buddy information is attached to the buddy cache inode
  697. * for convenience. The information regarding each group
  698. * is loaded via ext4_mb_load_buddy. The information involves the
  699. * block bitmap and the buddy information. The information is
  700. * stored in the inode as
  701. *
  702. * { page }
  703. * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
  704. *
  705. *
  706. * one block each for bitmap and buddy information.
  707. * So for each group we take up 2 blocks. A page can
  708. * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
  709. * So it can have information regarding groups_per_page which
  710. * is blocks_per_page/2
  711. *
  712. * Locking note: This routine takes the block group lock of all groups
  713. * for this page; do not hold this lock when calling this routine!
  714. */
  715. static int ext4_mb_init_cache(struct page *page, char *incore)
  716. {
  717. ext4_group_t ngroups;
  718. int blocksize;
  719. int blocks_per_page;
  720. int groups_per_page;
  721. int err = 0;
  722. int i;
  723. ext4_group_t first_group;
  724. int first_block;
  725. struct super_block *sb;
  726. struct buffer_head *bhs;
  727. struct buffer_head **bh;
  728. struct inode *inode;
  729. char *data;
  730. char *bitmap;
  731. struct ext4_group_info *grinfo;
  732. mb_debug(1, "init page %lu\n", page->index);
  733. inode = page->mapping->host;
  734. sb = inode->i_sb;
  735. ngroups = ext4_get_groups_count(sb);
  736. blocksize = 1 << inode->i_blkbits;
  737. blocks_per_page = PAGE_CACHE_SIZE / blocksize;
  738. groups_per_page = blocks_per_page >> 1;
  739. if (groups_per_page == 0)
  740. groups_per_page = 1;
  741. /* allocate buffer_heads to read bitmaps */
  742. if (groups_per_page > 1) {
  743. err = -ENOMEM;
  744. i = sizeof(struct buffer_head *) * groups_per_page;
  745. bh = kzalloc(i, GFP_NOFS);
  746. if (bh == NULL)
  747. goto out;
  748. } else
  749. bh = &bhs;
  750. first_group = page->index * blocks_per_page / 2;
  751. /* read all groups the page covers into the cache */
  752. for (i = 0; i < groups_per_page; i++) {
  753. struct ext4_group_desc *desc;
  754. if (first_group + i >= ngroups)
  755. break;
  756. grinfo = ext4_get_group_info(sb, first_group + i);
  757. /*
  758. * If page is uptodate then we came here after online resize
  759. * which added some new uninitialized group info structs, so
  760. * we must skip all initialized uptodate buddies on the page,
  761. * which may be currently in use by an allocating task.
  762. */
  763. if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
  764. bh[i] = NULL;
  765. continue;
  766. }
  767. err = -EIO;
  768. desc = ext4_get_group_desc(sb, first_group + i, NULL);
  769. if (desc == NULL)
  770. goto out;
  771. err = -ENOMEM;
  772. bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
  773. if (bh[i] == NULL)
  774. goto out;
  775. if (bitmap_uptodate(bh[i]))
  776. continue;
  777. lock_buffer(bh[i]);
  778. if (bitmap_uptodate(bh[i])) {
  779. unlock_buffer(bh[i]);
  780. continue;
  781. }
  782. ext4_lock_group(sb, first_group + i);
  783. if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
  784. ext4_init_block_bitmap(sb, bh[i],
  785. first_group + i, desc);
  786. set_bitmap_uptodate(bh[i]);
  787. set_buffer_uptodate(bh[i]);
  788. ext4_unlock_group(sb, first_group + i);
  789. unlock_buffer(bh[i]);
  790. continue;
  791. }
  792. ext4_unlock_group(sb, first_group + i);
  793. if (buffer_uptodate(bh[i])) {
  794. /*
  795. * if the group is not uninit and bh is uptodate,
  796. * the bitmap is also uptodate
  797. */
  798. set_bitmap_uptodate(bh[i]);
  799. unlock_buffer(bh[i]);
  800. continue;
  801. }
  802. get_bh(bh[i]);
  803. /*
  804. * submit the buffer_head for read. We can
  805. * safely mark the bitmap as uptodate now.
  806. * We do it here so the bitmap uptodate bit
  807. * gets set with the buffer lock held.
  808. */
  809. set_bitmap_uptodate(bh[i]);
  810. bh[i]->b_end_io = end_buffer_read_sync;
  811. submit_bh(READ, bh[i]);
  812. mb_debug(1, "read bitmap for group %u\n", first_group + i);
  813. }
  814. /* wait for I/O completion */
  815. for (i = 0; i < groups_per_page; i++)
  816. if (bh[i])
  817. wait_on_buffer(bh[i]);
  818. err = -EIO;
  819. for (i = 0; i < groups_per_page; i++)
  820. if (bh[i] && !buffer_uptodate(bh[i]))
  821. goto out;
  822. err = 0;
  823. first_block = page->index * blocks_per_page;
  824. for (i = 0; i < blocks_per_page; i++) {
  825. int group;
  826. group = (first_block + i) >> 1;
  827. if (group >= ngroups)
  828. break;
  829. if (!bh[group - first_group])
  830. /* skip initialized uptodate buddy */
  831. continue;
  832. /*
  833. * data carries information regarding this
  834. * particular group in the format specified
  835. * above
  836. *
  837. */
  838. data = page_address(page) + (i * blocksize);
  839. bitmap = bh[group - first_group]->b_data;
  840. /*
  841. * We place the buddy block and bitmap block
  842. * close together
  843. */
  844. if ((first_block + i) & 1) {
  845. /* this is block of buddy */
  846. BUG_ON(incore == NULL);
  847. mb_debug(1, "put buddy for group %u in page %lu/%x\n",
  848. group, page->index, i * blocksize);
  849. trace_ext4_mb_buddy_bitmap_load(sb, group);
  850. grinfo = ext4_get_group_info(sb, group);
  851. grinfo->bb_fragments = 0;
  852. memset(grinfo->bb_counters, 0,
  853. sizeof(*grinfo->bb_counters) *
  854. (sb->s_blocksize_bits+2));
  855. /*
  856. * incore got set to the group block bitmap below
  857. */
  858. ext4_lock_group(sb, group);
  859. /* init the buddy */
  860. memset(data, 0xff, blocksize);
  861. ext4_mb_generate_buddy(sb, data, incore, group);
  862. ext4_unlock_group(sb, group);
  863. incore = NULL;
  864. } else {
  865. /* this is block of bitmap */
  866. BUG_ON(incore != NULL);
  867. mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
  868. group, page->index, i * blocksize);
  869. trace_ext4_mb_bitmap_load(sb, group);
  870. /* see comments in ext4_mb_put_pa() */
  871. ext4_lock_group(sb, group);
  872. memcpy(data, bitmap, blocksize);
  873. /* mark all preallocated blks used in in-core bitmap */
  874. ext4_mb_generate_from_pa(sb, data, group);
  875. ext4_mb_generate_from_freelist(sb, data, group);
  876. ext4_unlock_group(sb, group);
  877. /* set incore so that the buddy information can be
  878. * generated using this
  879. */
  880. incore = data;
  881. }
  882. }
  883. SetPageUptodate(page);
  884. out:
  885. if (bh) {
  886. for (i = 0; i < groups_per_page; i++)
  887. brelse(bh[i]);
  888. if (bh != &bhs)
  889. kfree(bh);
  890. }
  891. return err;
  892. }
  893. /*
  894. * Lock the buddy and bitmap pages. This makes sure other parallel init_group
  895. * on the same buddy page doesn't happen while holding the buddy page lock.
  896. * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
  897. * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
  898. */
  899. static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
  900. ext4_group_t group, struct ext4_buddy *e4b)
  901. {
  902. struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
  903. int block, pnum, poff;
  904. int blocks_per_page;
  905. struct page *page;
  906. e4b->bd_buddy_page = NULL;
  907. e4b->bd_bitmap_page = NULL;
  908. blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
  909. /*
  910. * the buddy cache inode stores the block bitmap
  911. * and buddy information in consecutive blocks.
  912. * So for each group we need two blocks.
  913. */
  914. block = group * 2;
  915. pnum = block / blocks_per_page;
  916. poff = block % blocks_per_page;
  917. page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
  918. if (!page)
  919. return -EIO;
  920. BUG_ON(page->mapping != inode->i_mapping);
  921. e4b->bd_bitmap_page = page;
  922. e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
  923. if (blocks_per_page >= 2) {
  924. /* buddy and bitmap are on the same page */
  925. return 0;
  926. }
  927. block++;
  928. pnum = block / blocks_per_page;
  929. poff = block % blocks_per_page;
  930. page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
  931. if (!page)
  932. return -EIO;
  933. BUG_ON(page->mapping != inode->i_mapping);
  934. e4b->bd_buddy_page = page;
  935. return 0;
  936. }
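/*
 * Illustrative user-space sketch of the buddy-cache page arithmetic used
 * above: block 2*group holds the group's bitmap and block 2*group+1 its
 * buddy, and each is found at page 'block / blocks_per_page', offset
 * 'block % blocks_per_page' within that page.  A 4096-byte page size is
 * assumed here.
 */
#include <stdio.h>

static void locate(unsigned group, unsigned blocksize)
{
	unsigned blocks_per_page = 4096 / blocksize;
	unsigned bitmap_block = group * 2;
	unsigned buddy_block  = group * 2 + 1;

	printf("blocksize %u, group %u: bitmap page %u off %u, buddy page %u off %u\n",
	       blocksize, group,
	       bitmap_block / blocks_per_page, bitmap_block % blocks_per_page,
	       buddy_block  / blocks_per_page, buddy_block  % blocks_per_page);
}

int main(void)
{
	locate(0, 1024);   /* 4 blocks per page: bitmap and buddy share a page      */
	locate(3, 1024);
	locate(0, 4096);   /* 1 block per page: bitmap and buddy on separate pages  */
	locate(3, 4096);
	return 0;
}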
  937. static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
  938. {
  939. if (e4b->bd_bitmap_page) {
  940. unlock_page(e4b->bd_bitmap_page);
  941. page_cache_release(e4b->bd_bitmap_page);
  942. }
  943. if (e4b->bd_buddy_page) {
  944. unlock_page(e4b->bd_buddy_page);
  945. page_cache_release(e4b->bd_buddy_page);
  946. }
  947. }
  948. /*
  949. * Locking note: This routine calls ext4_mb_init_cache(), which takes the
  950. * block group lock of all groups for this page; do not hold the BG lock when
  951. * calling this routine!
  952. */
  953. static noinline_for_stack
  954. int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
  955. {
  956. struct ext4_group_info *this_grp;
  957. struct ext4_buddy e4b;
  958. struct page *page;
  959. int ret = 0;
  960. mb_debug(1, "init group %u\n", group);
  961. this_grp = ext4_get_group_info(sb, group);
  962. /*
  963. * This ensures that we don't reinit the buddy cache
  964. * page which maps to the group from which we are already
  965. * allocating. If we are looking at the buddy cache we would
  966. * have taken a reference using ext4_mb_load_buddy and that
  967. * would have pinned buddy page to page cache.
  968. */
  969. ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
  970. if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
  971. /*
  972. * somebody initialized the group
  973. * return without doing anything
  974. */
  975. goto err;
  976. }
  977. page = e4b.bd_bitmap_page;
  978. ret = ext4_mb_init_cache(page, NULL);
  979. if (ret)
  980. goto err;
  981. if (!PageUptodate(page)) {
  982. ret = -EIO;
  983. goto err;
  984. }
  985. mark_page_accessed(page);
  986. if (e4b.bd_buddy_page == NULL) {
  987. /*
  988. * If both the bitmap and buddy are in
  989. * the same page we don't need to force
  990. * init the buddy
  991. */
  992. ret = 0;
  993. goto err;
  994. }
  995. /* init buddy cache */
  996. page = e4b.bd_buddy_page;
  997. ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
  998. if (ret)
  999. goto err;
  1000. if (!PageUptodate(page)) {
  1001. ret = -EIO;
  1002. goto err;
  1003. }
  1004. mark_page_accessed(page);
  1005. err:
  1006. ext4_mb_put_buddy_page_lock(&e4b);
  1007. return ret;
  1008. }
  1009. /*
  1010. * Locking note: This routine calls ext4_mb_init_cache(), which takes the
  1011. * block group lock of all groups for this page; do not hold the BG lock when
  1012. * calling this routine!
  1013. */
  1014. static noinline_for_stack int
  1015. ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
  1016. struct ext4_buddy *e4b)
  1017. {
  1018. int blocks_per_page;
  1019. int block;
  1020. int pnum;
  1021. int poff;
  1022. struct page *page;
  1023. int ret;
  1024. struct ext4_group_info *grp;
  1025. struct ext4_sb_info *sbi = EXT4_SB(sb);
  1026. struct inode *inode = sbi->s_buddy_cache;
  1027. mb_debug(1, "load group %u\n", group);
  1028. blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
  1029. grp = ext4_get_group_info(sb, group);
  1030. e4b->bd_blkbits = sb->s_blocksize_bits;
  1031. e4b->bd_info = grp;
  1032. e4b->bd_sb = sb;
  1033. e4b->bd_group = group;
  1034. e4b->bd_buddy_page = NULL;
  1035. e4b->bd_bitmap_page = NULL;
  1036. if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
  1037. /*
  1038. * we need full data about the group
  1039. * to make a good selection
  1040. */
  1041. ret = ext4_mb_init_group(sb, group);
  1042. if (ret)
  1043. return ret;
  1044. }
  1045. /*
  1046. * the buddy cache inode stores the block bitmap
  1047. * and buddy information in consecutive blocks.
  1048. * So for each group we need two blocks.
  1049. */
  1050. block = group * 2;
  1051. pnum = block / blocks_per_page;
  1052. poff = block % blocks_per_page;
  1053. /* we could use find_or_create_page(), but it locks the page,
  1054. * which we'd like to avoid in the fast path ... */
  1055. page = find_get_page(inode->i_mapping, pnum);
  1056. if (page == NULL || !PageUptodate(page)) {
  1057. if (page)
  1058. /*
  1059. * drop the page reference and try
  1060. * to get the page with the lock. If we
  1061. * are not uptodate that implies
  1062. * somebody just created the page but
  1063. * has not yet initialized it. So
  1064. * wait for it to initialize.
  1065. */
  1066. page_cache_release(page);
  1067. page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
  1068. if (page) {
  1069. BUG_ON(page->mapping != inode->i_mapping);
  1070. if (!PageUptodate(page)) {
  1071. ret = ext4_mb_init_cache(page, NULL);
  1072. if (ret) {
  1073. unlock_page(page);
  1074. goto err;
  1075. }
  1076. mb_cmp_bitmaps(e4b, page_address(page) +
  1077. (poff * sb->s_blocksize));
  1078. }
  1079. unlock_page(page);
  1080. }
  1081. }
  1082. if (page == NULL || !PageUptodate(page)) {
  1083. ret = -EIO;
  1084. goto err;
  1085. }
  1086. e4b->bd_bitmap_page = page;
  1087. e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
  1088. mark_page_accessed(page);
  1089. block++;
  1090. pnum = block / blocks_per_page;
  1091. poff = block % blocks_per_page;
  1092. page = find_get_page(inode->i_mapping, pnum);
  1093. if (page == NULL || !PageUptodate(page)) {
  1094. if (page)
  1095. page_cache_release(page);
  1096. page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
  1097. if (page) {
  1098. BUG_ON(page->mapping != inode->i_mapping);
  1099. if (!PageUptodate(page)) {
  1100. ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
  1101. if (ret) {
  1102. unlock_page(page);
  1103. goto err;
  1104. }
  1105. }
  1106. unlock_page(page);
  1107. }
  1108. }
  1109. if (page == NULL || !PageUptodate(page)) {
  1110. ret = -EIO;
  1111. goto err;
  1112. }
  1113. e4b->bd_buddy_page = page;
  1114. e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
  1115. mark_page_accessed(page);
  1116. BUG_ON(e4b->bd_bitmap_page == NULL);
  1117. BUG_ON(e4b->bd_buddy_page == NULL);
  1118. return 0;
  1119. err:
  1120. if (page)
  1121. page_cache_release(page);
  1122. if (e4b->bd_bitmap_page)
  1123. page_cache_release(e4b->bd_bitmap_page);
  1124. if (e4b->bd_buddy_page)
  1125. page_cache_release(e4b->bd_buddy_page);
  1126. e4b->bd_buddy = NULL;
  1127. e4b->bd_bitmap = NULL;
  1128. return ret;
  1129. }
  1130. static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
  1131. {
  1132. if (e4b->bd_bitmap_page)
  1133. page_cache_release(e4b->bd_bitmap_page);
  1134. if (e4b->bd_buddy_page)
  1135. page_cache_release(e4b->bd_buddy_page);
  1136. }
  1137. static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
  1138. {
  1139. int order = 1;
  1140. void *bb;
  1141. BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
  1142. BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
  1143. bb = EXT4_MB_BUDDY(e4b);
  1144. while (order <= e4b->bd_blkbits + 1) {
  1145. block = block >> 1;
  1146. if (!mb_test_bit(block, bb)) {
  1147. /* this block is part of buddy of order 'order' */
  1148. return order;
  1149. }
  1150. bb += 1 << (e4b->bd_blkbits - order);
  1151. order++;
  1152. }
  1153. return 0;
  1154. }
  1155. static void mb_clear_bits(void *bm, int cur, int len)
  1156. {
  1157. __u32 *addr;
  1158. len = cur + len;
  1159. while (cur < len) {
  1160. if ((cur & 31) == 0 && (len - cur) >= 32) {
  1161. /* fast path: clear whole word at once */
  1162. addr = bm + (cur >> 3);
  1163. *addr = 0;
  1164. cur += 32;
  1165. continue;
  1166. }
  1167. mb_clear_bit(cur, bm);
  1168. cur++;
  1169. }
  1170. }
  1171. void ext4_set_bits(void *bm, int cur, int len)
  1172. {
  1173. __u32 *addr;
  1174. len = cur + len;
  1175. while (cur < len) {
  1176. if ((cur & 31) == 0 && (len - cur) >= 32) {
  1177. /* fast path: set whole word at once */
  1178. addr = bm + (cur >> 3);
  1179. *addr = 0xffffffff;
  1180. cur += 32;
  1181. continue;
  1182. }
  1183. mb_set_bit(cur, bm);
  1184. cur++;
  1185. }
  1186. }
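/*
 * Illustrative user-space sketch of the word-at-a-time fast path used by
 * mb_clear_bits()/ext4_set_bits() above: when the current bit is 32-bit
 * aligned and at least 32 bits remain, a whole word is written at once;
 * otherwise bits are set one by one.  The byte-ordered bitmap helper is a
 * simplification of the kernel's mb_set_bit().
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void set_bit8(unsigned char *bm, int bit)
{
	bm[bit >> 3] |= 1u << (bit & 7);
}

static void set_bits_fast(unsigned char *bm, int cur, int len)
{
	len = cur + len;
	while (cur < len) {
		if ((cur & 31) == 0 && (len - cur) >= 32) {
			/* fast path: the next 32 bits fill one aligned word */
			*(uint32_t *)(bm + (cur >> 3)) = 0xffffffff;
			cur += 32;
			continue;
		}
		set_bit8(bm, cur);
		cur++;
	}
}

int main(void)
{
	/* uint32_t backing storage keeps the fast-path stores aligned */
	uint32_t a_words[4] = { 0 }, b_words[4] = { 0 };
	unsigned char *a = (unsigned char *)a_words, *b = (unsigned char *)b_words;
	int i;

	set_bits_fast(a, 7, 70);            /* slow head, two fast words, slow tail */
	for (i = 7; i < 77; i++)
		set_bit8(b, i);             /* reference: one bit at a time */

	assert(memcmp(a, b, sizeof(a_words)) == 0);
	printf("fast path and per-bit path agree\n");
	return 0;
}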
  1187. static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
  1188. int first, int count)
  1189. {
  1190. int block = 0;
  1191. int max = 0;
  1192. int order;
  1193. void *buddy;
  1194. void *buddy2;
  1195. struct super_block *sb = e4b->bd_sb;
  1196. BUG_ON(first + count > (sb->s_blocksize << 3));
  1197. assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
  1198. mb_check_buddy(e4b);
  1199. mb_free_blocks_double(inode, e4b, first, count);
  1200. e4b->bd_info->bb_free += count;
  1201. if (first < e4b->bd_info->bb_first_free)
  1202. e4b->bd_info->bb_first_free = first;
  1203. /* let's maintain fragments counter */
  1204. if (first != 0)
  1205. block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
  1206. if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
  1207. max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
  1208. if (block && max)
  1209. e4b->bd_info->bb_fragments--;
  1210. else if (!block && !max)
  1211. e4b->bd_info->bb_fragments++;
  1212. /* let's maintain buddy itself */
  1213. while (count-- > 0) {
  1214. block = first++;
  1215. order = 0;
  1216. if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
  1217. ext4_fsblk_t blocknr;
  1218. blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
  1219. blocknr += block;
  1220. ext4_grp_locked_error(sb, e4b->bd_group,
  1221. inode ? inode->i_ino : 0,
  1222. blocknr,
  1223. "freeing already freed block "
  1224. "(bit %u)", block);
  1225. }
  1226. mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
  1227. e4b->bd_info->bb_counters[order]++;
  1228. /* start of the buddy */
  1229. buddy = mb_find_buddy(e4b, order, &max);
  1230. do {
  1231. block &= ~1UL;
  1232. if (mb_test_bit(block, buddy) ||
  1233. mb_test_bit(block + 1, buddy))
  1234. break;
  1235. /* both the buddies are free, try to coalesce them */
  1236. buddy2 = mb_find_buddy(e4b, order + 1, &max);
  1237. if (!buddy2)
  1238. break;
  1239. if (order > 0) {
  1240. /* for special purposes, we don't set
  1241. * free bits in bitmap */
  1242. mb_set_bit(block, buddy);
  1243. mb_set_bit(block + 1, buddy);
  1244. }
  1245. e4b->bd_info->bb_counters[order]--;
  1246. e4b->bd_info->bb_counters[order]--;
  1247. block = block >> 1;
  1248. order++;
  1249. e4b->bd_info->bb_counters[order]++;
  1250. mb_clear_bit(block, buddy2);
  1251. buddy = buddy2;
  1252. } while (1);
  1253. }
  1254. mb_set_largest_free_order(sb, e4b->bd_info);
  1255. mb_check_buddy(e4b);
  1256. }
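/*
 * Illustrative user-space sketch of the fragment bookkeeping performed in
 * mb_free_blocks() above: freeing a fully used range removes a fragment
 * boundary when both neighbouring blocks are already free, creates a new
 * fragment when both neighbours are used, and leaves the count unchanged
 * otherwise.  The brute-force recount is only there to verify the rule.
 */
#include <assert.h>
#include <stdio.h>
#include <string.h>

#define NBLOCKS 16

/* count maximal runs of free (0) blocks */
static int count_fragments(const char *used)
{
	int i, frags = 0;
	for (i = 0; i < NBLOCKS; i++)
		if (!used[i] && (i == 0 || used[i - 1]))
			frags++;
	return frags;
}

static void free_range(char *used, int first, int count, int *fragments)
{
	int left_free  = first > 0 && !used[first - 1];
	int right_free = first + count < NBLOCKS && !used[first + count];

	if (left_free && right_free)
		(*fragments)--;            /* joins two fragments into one */
	else if (!left_free && !right_free)
		(*fragments)++;            /* becomes a brand new fragment */
	/* exactly one free neighbour: an existing fragment just grows */

	memset(used + first, 0, count);
}

int main(void)
{
	/* 1 = used, 0 = free */
	char used[NBLOCKS] = { 1,1,0,0,1,1,1,1, 0,1,1,1,1,1,1,1 };
	int fragments = count_fragments(used);

	free_range(used, 4, 4, &fragments);    /* both neighbours free */
	assert(fragments == count_fragments(used));

	free_range(used, 10, 2, &fragments);   /* both neighbours used */
	assert(fragments == count_fragments(used));

	printf("fragments = %d\n", fragments);
	return 0;
}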
  1257. static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
  1258. int needed, struct ext4_free_extent *ex)
  1259. {
  1260. int next = block;
  1261. int max;
  1262. int ord;
  1263. void *buddy;
  1264. assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
  1265. BUG_ON(ex == NULL);
  1266. buddy = mb_find_buddy(e4b, order, &max);
  1267. BUG_ON(buddy == NULL);
  1268. BUG_ON(block >= max);
  1269. if (mb_test_bit(block, buddy)) {
  1270. ex->fe_len = 0;
  1271. ex->fe_start = 0;
  1272. ex->fe_group = 0;
  1273. return 0;
  1274. }
  1275. /* FIXME: drop order completely ? */
  1276. if (likely(order == 0)) {
  1277. /* find actual order */
  1278. order = mb_find_order_for_block(e4b, block);
  1279. block = block >> order;
  1280. }
  1281. ex->fe_len = 1 << order;
  1282. ex->fe_start = block << order;
  1283. ex->fe_group = e4b->bd_group;
  1284. /* calc difference from given start */
  1285. next = next - ex->fe_start;
  1286. ex->fe_len -= next;
  1287. ex->fe_start += next;
  1288. while (needed > ex->fe_len &&
  1289. (buddy = mb_find_buddy(e4b, order, &max))) {
  1290. if (block + 1 >= max)
  1291. break;
  1292. next = (block + 1) * (1 << order);
  1293. if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
  1294. break;
  1295. ord = mb_find_order_for_block(e4b, next);
  1296. order = ord;
  1297. block = next >> order;
  1298. ex->fe_len += 1 << order;
  1299. }
  1300. BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
  1301. return ex->fe_len;
  1302. }
  1303. static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
  1304. {
  1305. int ord;
  1306. int mlen = 0;
  1307. int max = 0;
  1308. int cur;
  1309. int start = ex->fe_start;
  1310. int len = ex->fe_len;
  1311. unsigned ret = 0;
  1312. int len0 = len;
  1313. void *buddy;
  1314. BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
  1315. BUG_ON(e4b->bd_group != ex->fe_group);
  1316. assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
  1317. mb_check_buddy(e4b);
  1318. mb_mark_used_double(e4b, start, len);
  1319. e4b->bd_info->bb_free -= len;
  1320. if (e4b->bd_info->bb_first_free == start)
  1321. e4b->bd_info->bb_first_free += len;
  1322. /* let's maintain fragments counter */
  1323. if (start != 0)
  1324. mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
  1325. if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
  1326. max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
  1327. if (mlen && max)
  1328. e4b->bd_info->bb_fragments++;
  1329. else if (!mlen && !max)
  1330. e4b->bd_info->bb_fragments--;
  1331. /* let's maintain buddy itself */
  1332. while (len) {
  1333. ord = mb_find_order_for_block(e4b, start);
  1334. if (((start >> ord) << ord) == start && len >= (1 << ord)) {
  1335. /* the whole chunk may be allocated at once! */
  1336. mlen = 1 << ord;
  1337. buddy = mb_find_buddy(e4b, ord, &max);
  1338. BUG_ON((start >> ord) >= max);
  1339. mb_set_bit(start >> ord, buddy);
  1340. e4b->bd_info->bb_counters[ord]--;
  1341. start += mlen;
  1342. len -= mlen;
  1343. BUG_ON(len < 0);
  1344. continue;
  1345. }
  1346. /* store for history */
  1347. if (ret == 0)
  1348. ret = len | (ord << 16);
  1349. /* we have to split large buddy */
  1350. BUG_ON(ord <= 0);
  1351. buddy = mb_find_buddy(e4b, ord, &max);
  1352. mb_set_bit(start >> ord, buddy);
  1353. e4b->bd_info->bb_counters[ord]--;
  1354. ord--;
  1355. cur = (start >> ord) & ~1U;
  1356. buddy = mb_find_buddy(e4b, ord, &max);
  1357. mb_clear_bit(cur, buddy);
  1358. mb_clear_bit(cur + 1, buddy);
  1359. e4b->bd_info->bb_counters[ord]++;
  1360. e4b->bd_info->bb_counters[ord]++;
  1361. }
  1362. mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
  1363. ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
  1364. mb_check_buddy(e4b);
  1365. return ret;
  1366. }
  1367. /*
  1368. * Must be called under group lock!
  1369. */
  1370. static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
  1371. struct ext4_buddy *e4b)
  1372. {
  1373. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  1374. int ret;
  1375. BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
  1376. BUG_ON(ac->ac_status == AC_STATUS_FOUND);
  1377. ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
  1378. ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
  1379. ret = mb_mark_used(e4b, &ac->ac_b_ex);
  1380. /* preallocation can change ac_b_ex, thus we store actually
  1381. * allocated blocks for history */
  1382. ac->ac_f_ex = ac->ac_b_ex;
  1383. ac->ac_status = AC_STATUS_FOUND;
  1384. ac->ac_tail = ret & 0xffff;
  1385. ac->ac_buddy = ret >> 16;
  1386. /*
  1387. * take the page reference. We want the page to be pinned
  1388. * so that we don't get an ext4_mb_init_cache call for this
  1389. * group until we update the bitmap. That would mean we
  1390. * could double-allocate blocks. The reference is dropped
  1391. * in ext4_mb_release_context
  1392. */
  1393. ac->ac_bitmap_page = e4b->bd_bitmap_page;
  1394. get_page(ac->ac_bitmap_page);
  1395. ac->ac_buddy_page = e4b->bd_buddy_page;
  1396. get_page(ac->ac_buddy_page);
  1397. /* store last allocated for subsequent stream allocation */
  1398. if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
  1399. spin_lock(&sbi->s_md_lock);
  1400. sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
  1401. sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
  1402. spin_unlock(&sbi->s_md_lock);
  1403. }
  1404. }
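/*
 * Illustrative sketch of the packing mb_mark_used() returns and
 * ext4_mb_use_best_found() unpacks above: the length of the first split
 * lives in the low 16 bits (ac_tail) and its buddy order in the high
 * bits (ac_buddy).  The values are hypothetical and assume len fits in
 * 16 bits.
 */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned len = 37, ord = 6;
	unsigned ret = len | (ord << 16);

	assert((ret & 0xffff) == len);   /* becomes ac->ac_tail  */
	assert((ret >> 16) == ord);      /* becomes ac->ac_buddy */
	printf("ret=0x%x tail=%u buddy-order=%u\n", ret, ret & 0xffff, ret >> 16);
	return 0;
}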
  1405. /*
  1406. * regular allocator, for general purposes allocation
  1407. */
  1408. static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
  1409. struct ext4_buddy *e4b,
  1410. int finish_group)
  1411. {
  1412. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  1413. struct ext4_free_extent *bex = &ac->ac_b_ex;
  1414. struct ext4_free_extent *gex = &ac->ac_g_ex;
  1415. struct ext4_free_extent ex;
  1416. int max;
  1417. if (ac->ac_status == AC_STATUS_FOUND)
  1418. return;
  1419. /*
  1420. * We don't want to scan for a whole year
  1421. */
  1422. if (ac->ac_found > sbi->s_mb_max_to_scan &&
  1423. !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
  1424. ac->ac_status = AC_STATUS_BREAK;
  1425. return;
  1426. }
  1427. /*
  1428. * Haven't found good chunk so far, let's continue
  1429. */
  1430. if (bex->fe_len < gex->fe_len)
  1431. return;
  1432. if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
  1433. && bex->fe_group == e4b->bd_group) {
  1434. /* recheck chunk's availability - we don't know
  1435. * when it was found (within this lock-unlock
  1436. * period or not) */
  1437. max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
  1438. if (max >= gex->fe_len) {
  1439. ext4_mb_use_best_found(ac, e4b);
  1440. return;
  1441. }
  1442. }
  1443. }
  1444. /*
  1445. * The routine checks whether the found extent is good enough. If it is,
  1446. * then the extent gets marked used and a flag is set in the context
  1447. * to stop scanning. Otherwise, the extent is compared with the
  1448. * previously found extent and if the new one is better, then it's stored
  1449. * in the context. Later, the best found extent will be used, if
  1450. * mballoc can't find a good enough extent.
  1451. *
  1452. * FIXME: real allocation policy is to be designed yet!
  1453. */
  1454. static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
  1455. struct ext4_free_extent *ex,
  1456. struct ext4_buddy *e4b)
  1457. {
  1458. struct ext4_free_extent *bex = &ac->ac_b_ex;
  1459. struct ext4_free_extent *gex = &ac->ac_g_ex;
  1460. BUG_ON(ex->fe_len <= 0);
  1461. BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
  1462. BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
  1463. BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
  1464. ac->ac_found++;
  1465. /*
  1466. * The special case - take what you catch first
  1467. */
  1468. if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
  1469. *bex = *ex;
  1470. ext4_mb_use_best_found(ac, e4b);
  1471. return;
  1472. }
  1473. /*
  1474. * Let's check whether the chunk is good enough
  1475. */
  1476. if (ex->fe_len == gex->fe_len) {
  1477. *bex = *ex;
  1478. ext4_mb_use_best_found(ac, e4b);
  1479. return;
  1480. }
  1481. /*
  1482. * If this is the first found extent, just store it in the context
  1483. */
  1484. if (bex->fe_len == 0) {
  1485. *bex = *ex;
  1486. return;
  1487. }
  1488. /*
  1489. * If the newly found extent is better, store it in the context
  1490. */
  1491. if (bex->fe_len < gex->fe_len) {
  1492. /* if the request isn't satisfied, any found extent
  1493. * larger than previous best one is better */
  1494. if (ex->fe_len > bex->fe_len)
  1495. *bex = *ex;
  1496. } else if (ex->fe_len > gex->fe_len) {
  1497. /* if the request is satisfied, then we try to find
  1498. * an extent that still satisfies the request, but is
  1499. * smaller than the previous one */
  1500. if (ex->fe_len < bex->fe_len)
  1501. *bex = *ex;
  1502. }
  1503. ext4_mb_check_limits(ac, e4b, 0);
  1504. }
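/*
 * Illustrative user-space sketch of the selection policy implemented by
 * ext4_mb_measure_extent() above: an exact goal-length match is taken
 * immediately; while the best extent so far is still shorter than the
 * goal, any longer extent wins; once the goal has been exceeded, a
 * shorter extent that still exceeds the goal wins.  Helper names are
 * hypothetical.
 */
#include <stdio.h>

/* returns nonzero if 'ex_len' should replace 'bex_len' as best-so-far */
static int better_extent(int ex_len, int bex_len, int goal_len)
{
	if (ex_len == goal_len)
		return 1;                       /* exact fit: use it right away    */
	if (bex_len == 0)
		return 1;                       /* first candidate seen            */
	if (bex_len < goal_len)
		return ex_len > bex_len;        /* goal unmet: bigger is better    */
	if (ex_len > goal_len)
		return ex_len < bex_len;        /* goal met: tighter fit is better */
	return 0;
}

int main(void)
{
	int goal = 8, best = 0;
	int found[] = { 3, 6, 20, 12, 5, 8 };
	int i;

	for (i = 0; i < 6; i++) {
		if (better_extent(found[i], best, goal))
			best = found[i];
		printf("found %2d -> best %2d\n", found[i], best);
	}
	return 0;
}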
  1505. static noinline_for_stack
  1506. int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
  1507. struct ext4_buddy *e4b)
  1508. {
  1509. struct ext4_free_extent ex = ac->ac_b_ex;
  1510. ext4_group_t group = ex.fe_group;
  1511. int max;
  1512. int err;
  1513. BUG_ON(ex.fe_len <= 0);
  1514. err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
  1515. if (err)
  1516. return err;
  1517. ext4_lock_group(ac->ac_sb, group);
  1518. max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
  1519. if (max > 0) {
  1520. ac->ac_b_ex = ex;
  1521. ext4_mb_use_best_found(ac, e4b);
  1522. }
  1523. ext4_unlock_group(ac->ac_sb, group);
  1524. ext4_mb_unload_buddy(e4b);
  1525. return 0;
  1526. }
  1527. static noinline_for_stack
  1528. int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
  1529. struct ext4_buddy *e4b)
  1530. {
  1531. ext4_group_t group = ac->ac_g_ex.fe_group;
  1532. int max;
  1533. int err;
  1534. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  1535. struct ext4_free_extent ex;
  1536. if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
  1537. return 0;
  1538. err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
  1539. if (err)
  1540. return err;
  1541. ext4_lock_group(ac->ac_sb, group);
  1542. max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
  1543. ac->ac_g_ex.fe_len, &ex);
  1544. if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
  1545. ext4_fsblk_t start;
  1546. start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
  1547. ex.fe_start;
  1548. /* use do_div to get remainder (would be 64-bit modulo) */
  1549. if (do_div(start, sbi->s_stripe) == 0) {
  1550. ac->ac_found++;
  1551. ac->ac_b_ex = ex;
  1552. ext4_mb_use_best_found(ac, e4b);
  1553. }
  1554. } else if (max >= ac->ac_g_ex.fe_len) {
  1555. BUG_ON(ex.fe_len <= 0);
  1556. BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
  1557. BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
  1558. ac->ac_found++;
  1559. ac->ac_b_ex = ex;
  1560. ext4_mb_use_best_found(ac, e4b);
  1561. } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
  1562. /* Sometimes, the caller may want to merge even a small
  1563. * number of blocks into an existing extent */
  1564. BUG_ON(ex.fe_len <= 0);
  1565. BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
  1566. BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
  1567. ac->ac_found++;
  1568. ac->ac_b_ex = ex;
  1569. ext4_mb_use_best_found(ac, e4b);
  1570. }
  1571. ext4_unlock_group(ac->ac_sb, group);
  1572. ext4_mb_unload_buddy(e4b);
  1573. return 0;
  1574. }
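/*
 * Illustrative user-space sketch of the stripe check above: when the goal
 * length equals the stripe size, the found extent is only taken if its
 * first physical block is a multiple of the stripe (the kernel uses
 * do_div() because the block number is 64-bit).  Values are hypothetical.
 */
#include <stdio.h>

static int stripe_aligned(unsigned long long start, unsigned stripe)
{
	return stripe && (start % stripe) == 0;
}

int main(void)
{
	unsigned stripe = 16;

	printf("start 4096 %s aligned to stripe %u\n",
	       stripe_aligned(4096, stripe) ? "is" : "is not", stripe);
	printf("start 4100 %s aligned to stripe %u\n",
	       stripe_aligned(4100, stripe) ? "is" : "is not", stripe);
	return 0;
}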
  1575. /*
  1576. * The routine scans buddy structures (not the bitmap!) from the given order
  1577. * up to the max order and tries to find a big enough chunk to satisfy the req
  1578. */
  1579. static noinline_for_stack
  1580. void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
  1581. struct ext4_buddy *e4b)
  1582. {
  1583. struct super_block *sb = ac->ac_sb;
  1584. struct ext4_group_info *grp = e4b->bd_info;
  1585. void *buddy;
  1586. int i;
  1587. int k;
  1588. int max;
  1589. BUG_ON