PageRenderTime 39ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/mordor/iomanager_epoll.cpp

http://github.com/mozy/mordor
C++ | 506 lines | 438 code | 45 blank | 23 comment | 103 complexity | 2f2cd80aca4fe1caa754112e370e4df3 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. // Copyright (c) 2009 - Mozy, Inc.
  2. #include "pch.h"
  3. #ifdef LINUX
  4. #include "iomanager_epoll.h"
  5. #include <sys/epoll.h>
  6. #include <boost/exception_ptr.hpp>
  7. #include "assert.h"
  8. #include "atomic.h"
  9. #include "fiber.h"
  10. // EPOLLRDHUP is missing in the header on etch
  11. #ifndef EPOLLRDHUP
  12. #define EPOLLRDHUP 0x2000
  13. #endif
  14. namespace Mordor {
// File-scope logger used by every IOManager diagnostic below
// (the "mordor:iomanager" log channel).
static Logger::ptr g_log = Log::lookup("mordor:iomanager");
  16. enum epoll_ctl_op_t
  17. {
  18. epoll_ctl_op_t_dummy = 0x7ffffff
  19. };
  20. static std::ostream &operator <<(std::ostream &os, epoll_ctl_op_t op)
  21. {
  22. switch ((int)op) {
  23. case EPOLL_CTL_ADD:
  24. return os << "EPOLL_CTL_ADD";
  25. case EPOLL_CTL_MOD:
  26. return os << "EPOLL_CTL_MOD";
  27. case EPOLL_CTL_DEL:
  28. return os << "EPOLL_CTL_DEL";
  29. default:
  30. return os << (int)op;
  31. }
  32. }
  33. static std::ostream &operator <<(std::ostream &os, EPOLL_EVENTS events)
  34. {
  35. if (!events) {
  36. return os << '0';
  37. }
  38. bool one = false;
  39. if (events & EPOLLIN) {
  40. os << "EPOLLIN";
  41. one = true;
  42. }
  43. if (events & EPOLLOUT) {
  44. if (one) os << " | ";
  45. os << "EPOLLOUT";
  46. one = true;
  47. }
  48. if (events & EPOLLPRI) {
  49. if (one) os << " | ";
  50. os << "EPOLLPRI";
  51. one = true;
  52. }
  53. if (events & EPOLLERR) {
  54. if (one) os << " | ";
  55. os << "EPOLLERR";
  56. one = true;
  57. }
  58. if (events & EPOLLHUP) {
  59. if (one) os << " | ";
  60. os << "EPOLLHUP";
  61. one = true;
  62. }
  63. if (events & EPOLLET) {
  64. if (one) os << " | ";
  65. os << "EPOLLET";
  66. one = true;
  67. }
  68. if (events & EPOLLONESHOT) {
  69. if (one) os << " | ";
  70. os << "EPOLLONESHOT";
  71. one = true;
  72. }
  73. if (events & EPOLLRDHUP) {
  74. if (one) os << " | ";
  75. os << "EPOLLRDHUP";
  76. one = true;
  77. }
  78. events = (EPOLL_EVENTS)(events & ~(EPOLLIN | EPOLLOUT | EPOLLPRI | EPOLLERR | EPOLLHUP | EPOLLET | EPOLLONESHOT | EPOLLRDHUP));
  79. if (events) {
  80. if (one) os << " | ";
  81. os << (uint32_t)events;
  82. }
  83. return os;
  84. }
// Construct an idle AsyncState: no fd assigned yet (0) and no events
// currently registered with epoll.
IOManager::AsyncState::AsyncState()
: m_fd(0),
  m_events(NONE)
{}
// Destruction-time sanity check: taking the lock serializes with any
// thread still touching this state, and the (non-throwing) assert
// verifies no events were left registered for this fd.
IOManager::AsyncState::~AsyncState()
{
    boost::mutex::scoped_lock lock(m_mutex);
    MORDOR_NOTHROW_ASSERT(!m_events);
}
  94. IOManager::AsyncState::EventContext &
  95. IOManager::AsyncState::contextForEvent(Event event)
  96. {
  97. switch (event) {
  98. case READ:
  99. return m_in;
  100. case WRITE:
  101. return m_out;
  102. case CLOSE:
  103. return m_close;
  104. default:
  105. MORDOR_NOTREACHED();
  106. }
  107. }
// Fire 'event' if (and only if) it is currently registered: clear its
// bit, decrement the manager-wide pending counter, and schedule the
// waiting dg/fiber on whichever Scheduler registered it.  Returns false
// when the event was not registered (e.g. another thread already
// triggered or cancelled it).
// NOTE(review): callers appear to hold m_mutex around this — confirm at
// call sites; nothing here takes the lock itself.
bool
IOManager::AsyncState::triggerEvent(Event event, size_t &pendingEventCount)
{
    if (!(m_events & event))
        return false;
    // Clear the bit before scheduling so a concurrent observer sees the
    // event as no longer pending.
    m_events = (Event)(m_events & ~event);
    atomicDecrement(pendingEventCount);
    EventContext &context = contextForEvent(event);
    // Wake whichever continuation was stored at registration time: the
    // callback if one was given, else the blocked fiber.  Passing the
    // address presumably lets the scheduler swap the value out — verify
    // against Scheduler::schedule.
    if (context.dg) {
        context.scheduler->schedule(&context.dg);
    } else {
        context.scheduler->schedule(&context.fiber);
    }
    context.scheduler = NULL;
    return true;
}
// Deferred half of resetContext: runs on a Scheduler worker fiber and
// actually releases the fiber/dg captured in 'context' (a by-value copy
// made by boost::bind).
void
IOManager::AsyncState::asyncResetContext(AsyncState::EventContext& context)
{
    // fiber.reset is not necessary to be running under the lock.
    // However, it is needed to acquire the lock and then unlock
    // to ensure that this function is executed after the other
    // fiber which scheduled this async reset call.
    boost::mutex::scoped_lock lock(m_mutex);
    lock.unlock();
    context.fiber.reset();
    context.dg = NULL;
}
// Clear an event slot, handing the stored fiber/dg to a worker fiber
// (asyncResetContext) for destruction so they are never destroyed from
// inside IOManager::idle.
void
IOManager::AsyncState::resetContext(EventContext &context)
{
    // asynchronously reset fiber/dg to avoid destroying in IOManager::idle
    // NOTE: this function has the pre-condition that the m_mutex is
    // already acquired in upper level (which is true right now), in this
    // way, the asyncReset will not be executed until the m_mutex is released,
    // and it is surely run in Scheduler working fiber instead of idle fiber.
    // it is fine to pass context address to the boost function
    // since the address will be always valid until ~IOManager()
    context.scheduler->schedule(boost::bind(
        &IOManager::AsyncState::asyncResetContext, this, context));
    context.scheduler = NULL;
    context.fiber.reset();
    context.dg = NULL;
}
// Build the epoll fd plus the self-tickle pipe (used by tickle() to wake
// a thread parked in epoll_wait), then optionally start the scheduler
// threads.  Every failure path closes the fds opened so far before
// throwing, since the destructor will not run for a throwing constructor.
IOManager::IOManager(size_t threads, bool useCaller, bool autoStart, size_t batchSize)
: Scheduler(threads, useCaller, batchSize),
  m_pendingEventCount(0)
{
    // The size argument (5000) is only a hint; modern kernels ignore it.
    m_epfd = epoll_create(5000);
    MORDOR_LOG_LEVEL(g_log, m_epfd <= 0 ? Log::ERROR : Log::TRACE) << this
        << " epoll_create(5000): " << m_epfd;
    if (m_epfd <= 0)
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_create");
    // m_tickleFds[0] = read end, m_tickleFds[1] = write end.
    int rc = pipe(m_tickleFds);
    MORDOR_LOG_LEVEL(g_log, rc ? Log::ERROR : Log::VERBOSE) << this << " pipe(): "
        << rc << " (" << lastError() << ")";
    if (rc) {
        close(m_epfd);
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("pipe");
    }
    MORDOR_ASSERT(m_tickleFds[0] > 0);
    MORDOR_ASSERT(m_tickleFds[1] > 0);
    // Watch the read end edge-triggered; idle() drains it on wakeup.
    epoll_event event;
    memset(&event, 0, sizeof(epoll_event));
    event.events = EPOLLIN | EPOLLET;
    event.data.fd = m_tickleFds[0];
    // Both pipe ends must be non-blocking: idle() reads until EAGAIN and
    // tickle() tolerates EAGAIN on a full pipe.
    rc = fcntl(m_tickleFds[0], F_SETFL, O_NONBLOCK);
    if (rc == -1) {
        close(m_tickleFds[0]);
        close(m_tickleFds[1]);
        close(m_epfd);
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("fcntl");
    }
    rc = fcntl(m_tickleFds[1], F_SETFL, O_NONBLOCK);
    if (rc == -1) {
        close(m_tickleFds[0]);
        close(m_tickleFds[1]);
        close(m_epfd);
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("fcntl");
    }
    rc = epoll_ctl(m_epfd, EPOLL_CTL_ADD, m_tickleFds[0], &event);
    MORDOR_LOG_LEVEL(g_log, rc ? Log::ERROR : Log::VERBOSE) << this
        << " epoll_ctl(" << m_epfd << ", EPOLL_CTL_ADD, " << m_tickleFds[0]
        << ", EPOLLIN | EPOLLET): " << rc << " (" << lastError() << ")";
    if (rc) {
        close(m_tickleFds[0]);
        close(m_tickleFds[1]);
        close(m_epfd);
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_ctl");
    }
    if (autoStart) {
        try {
            start();
        } catch (...) {
            close(m_tickleFds[0]);
            close(m_tickleFds[1]);
            close(m_epfd);
            throw;
        }
    }
}
  209. IOManager::~IOManager()
  210. {
  211. stop();
  212. close(m_epfd);
  213. MORDOR_LOG_TRACE(g_log) << this << " close(" << m_epfd << ")";
  214. close(m_tickleFds[0]);
  215. MORDOR_LOG_VERBOSE(g_log) << this << " close(" << m_tickleFds[0] << ")";
  216. close(m_tickleFds[1]);
  217. // Yes, it would be more C++-esque to store a boost::shared_ptr in the
  218. // vector, but that requires an extra allocation per fd for the counter
  219. for (size_t i = 0; i < m_pendingEvents.size(); ++i) {
  220. if (m_pendingEvents[i])
  221. delete m_pendingEvents[i];
  222. }
  223. }
  224. bool
  225. IOManager::stopping()
  226. {
  227. unsigned long long timeout;
  228. return stopping(timeout);
  229. }
  230. void
  231. IOManager::registerEvent(int fd, Event event, boost::function<void ()> dg)
  232. {
  233. MORDOR_ASSERT(fd > 0);
  234. MORDOR_ASSERT(Scheduler::getThis());
  235. MORDOR_ASSERT(dg || Fiber::getThis());
  236. MORDOR_ASSERT(event == READ || event == WRITE || event == CLOSE);
  237. // Look up our state in the global map, expanding it if necessary
  238. boost::mutex::scoped_lock lock(m_mutex);
  239. if (m_pendingEvents.size() < (size_t)fd)
  240. m_pendingEvents.resize(fd * 3 / 2);
  241. if (!m_pendingEvents[fd - 1]) {
  242. m_pendingEvents[fd - 1] = new AsyncState();
  243. m_pendingEvents[fd - 1]->m_fd = fd;
  244. }
  245. AsyncState &state = *m_pendingEvents[fd - 1];
  246. MORDOR_ASSERT(fd == state.m_fd);
  247. lock.unlock();
  248. boost::mutex::scoped_lock lock2(state.m_mutex);
  249. MORDOR_ASSERT(!(state.m_events & event));
  250. int op = state.m_events ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
  251. epoll_event epevent;
  252. epevent.events = EPOLLET | state.m_events | event;
  253. epevent.data.ptr = &state;
  254. int rc = epoll_ctl(m_epfd, op, fd, &epevent);
  255. MORDOR_LOG_LEVEL(g_log, rc ? Log::ERROR : Log::VERBOSE) << this
  256. << " epoll_ctl(" << m_epfd << ", " << (epoll_ctl_op_t)op << ", "
  257. << fd << ", " << (EPOLL_EVENTS)epevent.events << "): " << rc
  258. << " (" << lastError() << ")";
  259. if (rc)
  260. MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_ctl");
  261. atomicIncrement(m_pendingEventCount);
  262. state.m_events = (Event)(state.m_events | event);
  263. AsyncState::EventContext &context = state.contextForEvent(event);
  264. MORDOR_ASSERT(!context.scheduler);
  265. MORDOR_ASSERT(!context.fiber);
  266. MORDOR_ASSERT(!context.dg);
  267. context.scheduler = Scheduler::getThis();
  268. if (dg) {
  269. context.dg.swap(dg);
  270. } else {
  271. context.fiber = Fiber::getThis();
  272. }
  273. }
// Remove a registered event WITHOUT waking its waiter; the stored
// fiber/dg is released asynchronously (resetContext) so it is never
// destroyed from inside IOManager::idle.  Returns false if the event
// was not registered.  Contrast with cancelEvent, which wakes the waiter.
bool
IOManager::unregisterEvent(int fd, Event event)
{
    MORDOR_ASSERT(fd > 0);
    MORDOR_ASSERT(event == READ || event == WRITE || event == CLOSE);
    boost::mutex::scoped_lock lock(m_mutex);
    if (m_pendingEvents.size() < (size_t)fd)
        return false;
    if (!m_pendingEvents[fd - 1])
        return false;
    AsyncState &state = *m_pendingEvents[fd - 1];
    MORDOR_ASSERT(fd == state.m_fd);
    // Same lock-ordering discipline as registerEvent: drop the map lock
    // before taking the per-fd lock.
    lock.unlock();
    boost::mutex::scoped_lock lock2(state.m_mutex);
    if (!(state.m_events & event))
        return false;
    MORDOR_ASSERT(fd == state.m_fd);
    // MOD to the remaining events, or DEL if this was the last one.
    Event newEvents = (Event)(state.m_events &~event);
    int op = newEvents ? EPOLL_CTL_MOD : EPOLL_CTL_DEL;
    epoll_event epevent;
    epevent.events = EPOLLET | newEvents;
    epevent.data.ptr = &state;
    int rc = epoll_ctl(m_epfd, op, fd, &epevent);
    MORDOR_LOG_LEVEL(g_log, rc ? Log::ERROR : Log::VERBOSE) << this
        << " epoll_ctl(" << m_epfd << ", " << (epoll_ctl_op_t)op << ", "
        << fd << ", " << (EPOLL_EVENTS)epevent.events << "): " << rc
        << " (" << lastError() << ")";
    if (rc)
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_ctl");
    atomicDecrement(m_pendingEventCount);
    state.m_events = newEvents;
    AsyncState::EventContext &context = state.contextForEvent(event);
    // spawn a dedicated fiber to do the cleanup
    state.resetContext(context);
    return true;
}
  310. bool
  311. IOManager::cancelEvent(int fd, Event event)
  312. {
  313. MORDOR_ASSERT(fd > 0);
  314. MORDOR_ASSERT(event == READ || event == WRITE || event == CLOSE);
  315. boost::mutex::scoped_lock lock(m_mutex);
  316. if (m_pendingEvents.size() < (size_t)fd)
  317. return false;
  318. if (!m_pendingEvents[fd - 1])
  319. return false;
  320. AsyncState &state = *m_pendingEvents[fd - 1];
  321. MORDOR_ASSERT(fd == state.m_fd);
  322. lock.unlock();
  323. boost::mutex::scoped_lock lock2(state.m_mutex);
  324. if (!(state.m_events & event))
  325. return false;
  326. MORDOR_ASSERT(fd == state.m_fd);
  327. Event newEvents = (Event)(state.m_events &~event);
  328. int op = newEvents ? EPOLL_CTL_MOD : EPOLL_CTL_DEL;
  329. epoll_event epevent;
  330. epevent.events = EPOLLET | newEvents;
  331. epevent.data.ptr = &state;
  332. int rc = epoll_ctl(m_epfd, op, fd, &epevent);
  333. MORDOR_LOG_LEVEL(g_log, rc ? Log::ERROR : Log::VERBOSE) << this
  334. << " epoll_ctl(" << m_epfd << ", " << (epoll_ctl_op_t)op << ", "
  335. << fd << ", " << (EPOLL_EVENTS)epevent.events << "): " << rc
  336. << " (" << lastError() << ")";
  337. if (rc)
  338. MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_ctl");
  339. state.triggerEvent(event, m_pendingEventCount);
  340. return true;
  341. }
  342. bool
  343. IOManager::stopping(unsigned long long &nextTimeout)
  344. {
  345. nextTimeout = nextTimer();
  346. return nextTimeout == ~0ull && Scheduler::stopping() &&
  347. m_pendingEventCount == 0;
  348. }
// The loop each scheduler thread parks in when it has no work: block in
// epoll_wait (bounded by the next timer deadline), dispatch expired
// timers, translate ready epoll events back into fiber/dg wakeups, then
// yield so the scheduler can run what was just scheduled.  Returns when
// stopping() says shutdown is complete.
void
IOManager::idle()
{
    epoll_event events[64];
    while (true) {
        unsigned long long nextTimeout;
        if (stopping(nextTimeout))
            return;
        int rc;
        int timeout;
        do {
            // Convert the timer deadline (presumably microseconds — the
            // /1000 suggests so; confirm against nextTimer()) to epoll's
            // millisecond granularity, rounding up so we never wake
            // before the timer is actually due.  ~0ull means no timer:
            // block indefinitely.
            if (nextTimeout != ~0ull)
                timeout = (int)(nextTimeout / 1000) + 1;
            else
                timeout = -1;
            rc = epoll_wait(m_epfd, events, 64, timeout);
            // Interrupted by a signal: recompute the deadline and retry.
            if (rc < 0 && errno == EINTR)
                nextTimeout = nextTimer();
            else
                break;
        } while (true);
        MORDOR_LOG_LEVEL(g_log, rc < 0 ? Log::ERROR : Log::VERBOSE) << this
            << " epoll_wait(" << m_epfd << ", 64, " << timeout << "): " << rc
            << " (" << lastError() << ")";
        if (rc < 0)
            MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_wait");
        // Hand every expired timer callback to the scheduler in one batch.
        std::vector<boost::function<void ()> > expired = processTimers();
        if (!expired.empty()) {
            schedule(expired.begin(), expired.end());
            expired.clear();
        }
        // An epoll_ctl failure mid-loop is remembered and rethrown after
        // the remaining ready events have been processed.
        boost::exception_ptr exception;
        for(int i = 0; i < rc; ++i) {
            epoll_event &event = events[i];
            if (event.data.fd == m_tickleFds[0]) {
                unsigned char dummy[256];
                int rc2;
                // every tickle write only 1 byte
                // but it does not have to be read by 1 byte
                // try to read more to save read() syscall
                while((rc2 = read(m_tickleFds[0], dummy, 256)) > 0) {
                    MORDOR_LOG_VERBOSE(g_log) << this << " received " << rc2 << " tickles";
                }
                // The pipe is non-blocking, so draining it must end in EAGAIN.
                MORDOR_VERIFY(rc2 < 0 && errno == EAGAIN);
                continue;
            }
            // data.ptr was set to the fd's AsyncState in registerEvent.
            AsyncState &state = *(AsyncState *)event.data.ptr;
            boost::mutex::scoped_lock lock2(state.m_mutex);
            MORDOR_LOG_TRACE(g_log) << " epoll_event {"
                << (EPOLL_EVENTS)event.events << ", " << state.m_fd
                << "}, registered for " << (EPOLL_EVENTS)state.m_events;
            // Error/hangup wakes both readers and writers so they can
            // observe the failure.
            if (event.events & (EPOLLERR | EPOLLHUP))
                event.events |= EPOLLIN | EPOLLOUT;
            int incomingEvents = NONE;
            if (event.events & EPOLLIN)
                incomingEvents = READ;
            if (event.events & EPOLLOUT)
                incomingEvents |= WRITE;
            if (event.events & EPOLLRDHUP)
                incomingEvents |= CLOSE;
            // Nothing will be triggered, probably because a prior cancelEvent call
            // (probably on a different thread) already triggered it, so no
            // need to tell epoll anything
            if ((state.m_events & incomingEvents) == NONE)
                continue;
            // Narrow epoll's interest set to the events still registered
            // (MOD), or drop the fd entirely (DEL).
            int remainingEvents = (state.m_events & ~incomingEvents);
            int op = remainingEvents ? EPOLL_CTL_MOD : EPOLL_CTL_DEL;
            event.events = EPOLLET | remainingEvents;
            int rc2 = epoll_ctl(m_epfd, op, state.m_fd, &event);
            MORDOR_LOG_LEVEL(g_log, rc2 ? Log::ERROR : Log::VERBOSE) << this
                << " epoll_ctl(" << m_epfd << ", " << (epoll_ctl_op_t)op << ", "
                << state.m_fd << ", " << (EPOLL_EVENTS)event.events << "): " << rc2
                << " (" << lastError() << ")";
            if (rc2) {
                // Capture the failure but keep servicing the other fds.
                try {
                    MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_ctl");
                } catch (boost::exception &) {
                    exception = boost::current_exception();
                    continue;
                }
            }
            // At least one of the incoming events must still have been
            // registered (checked above), so something must trigger.
            bool triggered = false;
            if (incomingEvents & READ)
                triggered = state.triggerEvent(READ, m_pendingEventCount);
            if (incomingEvents & WRITE)
                triggered = state.triggerEvent(WRITE, m_pendingEventCount) || triggered;
            if (incomingEvents & CLOSE)
                triggered = state.triggerEvent(CLOSE, m_pendingEventCount) || triggered;
            MORDOR_ASSERT(triggered);
        }
        if (exception)
            boost::rethrow_exception(exception);
        // Let the scheduler run everything scheduled above; an abort
        // here means the scheduler is tearing down.
        try {
            Fiber::yield();
        } catch (OperationAbortedException &) {
            return;
        }
    }
}
  448. void
  449. IOManager::tickle()
  450. {
  451. if (!hasIdleThreads()) {
  452. MORDOR_LOG_VERBOSE(g_log) << this << " 0 idle thread, no tickle.";
  453. return;
  454. }
  455. int rc = write(m_tickleFds[1], "T", 1);
  456. MORDOR_LOG_VERBOSE(g_log) << this << " write(" << m_tickleFds[1] << ", 1): "
  457. << rc << " (" << lastError() << ")";
  458. MORDOR_VERIFY(rc == 1 || (rc < 0 && errno == EAGAIN));
  459. }
  460. }
  461. #endif