
/block/linux-aio.c

https://gitlab.com/storedmirrors/qemu
/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"
#include "qapi/error.h"
#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 * tunable by the guest. If we get more outstanding requests at a time
 * than this we will get EAGAIN from io_submit which is communicated to
 * the guest as an I/O error.
 */
#define MAX_EVENTS 1024

/* Maximum number of requests in a batch. (default value) */
#define DEFAULT_MAX_BATCH 32

struct qemu_laiocb {
    Coroutine *co;
    LinuxAioState *ctx;
    struct iocb iocb;
    ssize_t ret;
    size_t nbytes;
    QEMUIOVector *qiov;
    bool is_read;
    QSIMPLEQ_ENTRY(qemu_laiocb) next;
};

typedef struct {
    int plugged;
    unsigned int in_queue;
    unsigned int in_flight;
    bool blocked;
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;

struct LinuxAioState {
    AioContext *aio_context;

    io_context_t ctx;
    EventNotifier e;

    /* io queue for submit at batch. Protected by AioContext lock. */
    LaioQueue io_q;

    /* I/O completion processing. Only runs in I/O thread. */
    QEMUBH *completion_bh;
    int event_idx;
    int event_max;
};

static void ioq_submit(LinuxAioState *s);

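/* Combine the res2 and res fields of an io_event into a single signed
 * return value (res in the low 32 bits, res2 in the high 32 bits). */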
static inline ssize_t io_event_ret(struct io_event *ev)
{
    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
}

/*
 * Completes an AIO request.
 */
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
    int ret;

    ret = laiocb->ret;
    if (ret != -ECANCELED) {
        if (ret == laiocb->nbytes) {
            ret = 0;
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
                qemu_iovec_memset(laiocb->qiov, ret, 0,
                    laiocb->qiov->size - ret);
            } else {
                ret = -ENOSPC;
            }
        }
    }

    laiocb->ret = ret;

    /*
     * If the coroutine is already entered it must be in ioq_submit() and
     * will notice laio->ret has been filled in when it eventually runs
     * later. Coroutines cannot be entered recursively so avoid doing
     * that!
     */
    if (!qemu_coroutine_entered(laiocb->co)) {
        aio_co_wake(laiocb->co);
    }
}

/**
 * aio_ring buffer which is shared between userspace and kernel.
 *
 * This is copied from linux/fs/aio.c; a common header does not exist, but
 * AIO has existed for ages so we assume the ABI is stable.
 */
struct aio_ring {
    unsigned id;              /* kernel internal index number */
    unsigned nr;              /* number of io_events */
    unsigned head;            /* Written to by userland or by kernel. */
    unsigned tail;

    unsigned magic;
    unsigned compat_features;
    unsigned incompat_features;
    unsigned header_length;   /* size of aio_ring */

    struct io_event io_events[];
};

/**
 * io_getevents_peek:
 * @ctx: AIO context
 * @events: pointer to the events array, output value
 *
 * Returns the number of completed events and sets a pointer to the events
 * array. This function does not update the internal ring buffer; it only
 * reads head and tail. When @events has been processed,
 * io_getevents_commit() must be called.
 */
static inline unsigned int io_getevents_peek(io_context_t ctx,
                                             struct io_event **events)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;
    unsigned int head = ring->head, tail = ring->tail;
    unsigned int nr;

    nr = tail >= head ? tail - head : ring->nr - head;
    *events = ring->io_events + head;

    /* To avoid speculative loads of s->events[i] before observing tail.
       Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
    smp_rmb();

    return nr;
}

/**
 * io_getevents_commit:
 * @ctx: AIO context
 * @nr: the number of events by which head should be advanced
 *
 * Advances the head of the ring buffer.
 */
static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;

    if (nr) {
        ring->head = (ring->head + nr) % ring->nr;
    }
}

/**
 * io_getevents_advance_and_peek:
 * @ctx: AIO context
 * @events: pointer to the events array, output value
 * @nr: the number of events by which head should be advanced
 *
 * Advances the head of the ring buffer and returns the number of elements
 * left.
 */
static inline unsigned int
io_getevents_advance_and_peek(io_context_t ctx,
                              struct io_event **events,
                              unsigned int nr)
{
    io_getevents_commit(ctx, nr);
    return io_getevents_peek(ctx, events);
}

/**
 * qemu_laio_process_completions:
 * @s: AIO state
 *
 * Fetches completed I/O requests and invokes their callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll(). In order to do this,
 * indices are kept in LinuxAioState. The function schedules BH completion so
 * it can be called again in a nested event loop. When there are no events
 * left to complete, the BH is canceled.
 */
static void qemu_laio_process_completions(LinuxAioState *s)
{
    struct io_event *events;

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
                                                         s->event_idx))) {
        for (s->event_idx = 0; s->event_idx < s->event_max; ) {
            struct iocb *iocb = events[s->event_idx].obj;
            struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[s->event_idx]);

            /* Change counters one-by-one because we can be nested. */
            s->io_q.in_flight--;
            s->event_idx++;
            qemu_laio_process_completion(laiocb);
        }
    }

    qemu_bh_cancel(s->completion_bh);

    /* If we are nested we have to notify the level above that we are done
     * by setting event_max to zero, the upper level will then jump out of
     * its own `for` loop. If we are the last, all counters drop to zero. */
    s->event_max = 0;
    s->event_idx = 0;
}

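/* Process any completed events and, unless submission is plugged, flush the
 * pending queue. Takes the AioContext lock around both steps. */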
static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
    aio_context_acquire(s->aio_context);
    qemu_laio_process_completions(s);

    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
    aio_context_release(s->aio_context);
}

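/* Bottom half scheduled by qemu_laio_process_completions() so that nested
 * event loops also see pending completions. */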
static void qemu_laio_completion_bh(void *opaque)
{
    LinuxAioState *s = opaque;

    qemu_laio_process_completions_and_submit(s);
}

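/* Event notifier read handler: the kernel signalled completions through the
 * eventfd attached to each iocb. */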
static void qemu_laio_completion_cb(EventNotifier *e)
{
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (event_notifier_test_and_clear(&s->e)) {
        qemu_laio_process_completions_and_submit(s);
    }
}

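/* Polling callback: returns true if the completion ring has events ready,
 * without consuming them. */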
static bool qemu_laio_poll_cb(void *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);
    struct io_event *events;

    return io_getevents_peek(s->ctx, &events);
}

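/* Called once qemu_laio_poll_cb() has reported ready events. */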
static void qemu_laio_poll_ready(EventNotifier *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    qemu_laio_process_completions_and_submit(s);
}

static void ioq_init(LaioQueue *io_q)
{
    QSIMPLEQ_INIT(&io_q->pending);
    io_q->plugged = 0;
    io_q->in_queue = 0;
    io_q->in_flight = 0;
    io_q->blocked = false;
}

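/*
 * Submit pending requests with io_submit(), in batches capped so that
 * in_flight never exceeds MAX_EVENTS. On EAGAIN stop and leave the rest
 * queued; on other errors fail the head request and retry the remainder.
 * The queue is marked blocked while requests remain queued.
 */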
static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_EVENTS];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    do {
        if (s->io_q.in_flight >= MAX_EVENTS) {
            break;
        }
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            break;
        }
        if (ret < 0) {
            /* Fail the first request, retry the rest */
            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
            s->io_q.in_queue--;
            aiocb->ret = ret;
            qemu_laio_process_completion(aiocb);
            continue;
        }

        s->io_q.in_flight += ret;
        s->io_q.in_queue  -= ret;
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
    s->io_q.blocked = (s->io_q.in_queue > 0);

    if (s->io_q.in_flight) {
        /* We can try to complete something just right away if there are
         * still requests in-flight. */
        qemu_laio_process_completions(s);
        /*
         * Even if we have completed everything (in_flight == 0), the queue
         * can still have pending requests (in_queue > 0). We do not attempt
         * to repeat submission to avoid an I/O hang. The reason is simple:
         * s->e is still set and the completion callback will be called
         * shortly, and all pending requests will be submitted from there.
         */
    }
}

static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
{
    uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;

    /*
     * AIO context can be shared between multiple block devices, so
     * `dev_max_batch` allows reducing the batch size for latency-sensitive
     * devices.
     */
    max_batch = MIN_NON_ZERO(dev_max_batch, max_batch);

    /* limit the batch with the number of available events */
    max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);

    return max_batch;
}

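/*
 * Plugging defers submission so that requests can be batched; each
 * laio_io_plug() call must be paired with laio_io_unplug(), which submits
 * once the queue is unplugged or the batch limit has been reached.
 */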
void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
{
    s->io_q.plugged++;
}

void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s,
                    uint64_t dev_max_batch)
{
    assert(s->io_q.plugged);
    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||
        (--s->io_q.plugged == 0 &&
         !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) {
        ioq_submit(s);
    }
}

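/*
 * Prepare the iocb for a read or write, attach the completion eventfd, and
 * queue the request; submit immediately unless the queue is blocked, or
 * plugged and still below the batch limit.
 */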
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
                          int type, uint64_t dev_max_batch)
{
    LinuxAioState *s = laiocb->ctx;
    struct iocb *iocbs = &laiocb->iocb;
    QEMUIOVector *qiov = laiocb->qiov;

    switch (type) {
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    /* Currently Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                        __func__, type);
        return -EIO;
    }
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.in_queue++;
    if (!s->io_q.blocked &&
        (!s->io_q.plugged ||
         s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) {
        ioq_submit(s);
    }

    return 0;
}

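/*
 * Coroutine entry point: issue the request and yield until
 * qemu_laio_process_completion() fills in laiocb.ret and wakes the
 * coroutine.
 */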
int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
                                uint64_t offset, QEMUIOVector *qiov, int type,
                                uint64_t dev_max_batch)
{
    int ret;
    struct qemu_laiocb laiocb = {
        .co = qemu_coroutine_self(),
        .nbytes = qiov->size,
        .ctx = s,
        .ret = -EINPROGRESS,
        .is_read = (type == QEMU_AIO_READ),
        .qiov = qiov,
    };

    ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
    if (ret < 0) {
        return ret;
    }

    if (laiocb.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }
    return laiocb.ret;
}

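/* Stop using @old_context: unregister the event notifier and delete the
 * completion BH. */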
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    aio_set_event_notifier(old_context, &s->e, false, NULL, NULL, NULL);
    qemu_bh_delete(s->completion_bh);
    s->aio_context = NULL;
}

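/* Register the completion BH and the event notifier (with poll handlers)
 * in @new_context. */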
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    s->aio_context = new_context;
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e, false,
                           qemu_laio_completion_cb,
                           qemu_laio_poll_cb,
                           qemu_laio_poll_ready);
}

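/* Allocate a LinuxAioState and create its event notifier and kernel AIO
 * context. Returns NULL and sets @errp on failure. */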
LinuxAioState *laio_init(Error **errp)
{
    int rc;
    LinuxAioState *s;

    s = g_malloc0(sizeof(*s));
    rc = event_notifier_init(&s->e, false);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to initialize event notifier");
        goto out_free_state;
    }

    rc = io_setup(MAX_EVENTS, &s->ctx);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to create linux AIO context");
        goto out_close_efd;
    }

    ioq_init(&s->io_q);

    return s;

out_close_efd:
    event_notifier_cleanup(&s->e);
out_free_state:
    g_free(s);
    return NULL;
}

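/* Tear down the event notifier and kernel AIO context, then free @s. */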
void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                        __func__, &s->ctx);
    }
    g_free(s);
}