PageRenderTime 208ms CodeModel.GetById 81ms app.highlight 88ms RepoModel.GetById 32ms app.codeStats 1ms

/mordor/iomanager_epoll.cpp

http://github.com/mozy/mordor
C++ | 506 lines | 438 code | 45 blank | 23 comment | 103 complexity | 2f2cd80aca4fe1caa754112e370e4df3 MD5 | raw file
  1// Copyright (c) 2009 - Mozy, Inc.
  2
  3#include "pch.h"
  4
  5#ifdef LINUX
  6
  7#include "iomanager_epoll.h"
  8
  9#include <sys/epoll.h>
 10
 11#include <boost/exception_ptr.hpp>
 12
 13#include "assert.h"
 14#include "atomic.h"
 15#include "fiber.h"
 16
 17// EPOLLRDHUP is missing in the header on etch
 18#ifndef EPOLLRDHUP
 19#define EPOLLRDHUP 0x2000
 20#endif
 21
 22namespace Mordor {
 23
 24static Logger::ptr g_log = Log::lookup("mordor:iomanager");
 25
// epoll_ctl() takes its operation as a plain int; this single-value enum
// exists only so the operator<< below has a distinct overloadable type,
// letting log statements cast the op and print EPOLL_CTL_ADD/MOD/DEL
// symbolically. The dummy value is never used at runtime.
enum epoll_ctl_op_t
{
    epoll_ctl_op_t_dummy = 0x7ffffff
};
 30
 31static std::ostream &operator <<(std::ostream &os, epoll_ctl_op_t op)
 32{
 33    switch ((int)op) {
 34        case EPOLL_CTL_ADD:
 35            return os << "EPOLL_CTL_ADD";
 36        case EPOLL_CTL_MOD:
 37            return os << "EPOLL_CTL_MOD";
 38        case EPOLL_CTL_DEL:
 39            return os << "EPOLL_CTL_DEL";
 40        default:
 41            return os << (int)op;
 42    }
 43}
 44
 45static std::ostream &operator <<(std::ostream &os, EPOLL_EVENTS events)
 46{
 47    if (!events) {
 48        return os << '0';
 49    }
 50    bool one = false;
 51    if (events & EPOLLIN) {
 52        os << "EPOLLIN";
 53        one = true;
 54    }
 55    if (events & EPOLLOUT) {
 56        if (one) os << " | ";
 57        os << "EPOLLOUT";
 58        one = true;
 59    }
 60    if (events & EPOLLPRI) {
 61        if (one) os << " | ";
 62        os << "EPOLLPRI";
 63        one = true;
 64    }
 65    if (events & EPOLLERR) {
 66        if (one) os << " | ";
 67        os << "EPOLLERR";
 68        one = true;
 69    }
 70    if (events & EPOLLHUP) {
 71        if (one) os << " | ";
 72        os << "EPOLLHUP";
 73        one = true;
 74    }
 75    if (events & EPOLLET) {
 76        if (one) os << " | ";
 77        os << "EPOLLET";
 78        one = true;
 79    }
 80    if (events & EPOLLONESHOT) {
 81        if (one) os << " | ";
 82        os << "EPOLLONESHOT";
 83        one = true;
 84    }
 85    if (events & EPOLLRDHUP) {
 86        if (one) os << " | ";
 87        os << "EPOLLRDHUP";
 88        one = true;
 89    }
 90    events = (EPOLL_EVENTS)(events & ~(EPOLLIN | EPOLLOUT | EPOLLPRI | EPOLLERR | EPOLLHUP | EPOLLET | EPOLLONESHOT | EPOLLRDHUP));
 91    if (events) {
 92        if (one) os << " | ";
 93        os << (uint32_t)events;
 94    }
 95    return os;
 96}
 97
// Per-fd bookkeeping starts unbound: no fd assigned, no events registered.
// m_fd is filled in by registerEvent() when the slot is first used.
IOManager::AsyncState::AsyncState()
    : m_fd(0),
      m_events(NONE)
{}
102
IOManager::AsyncState::~AsyncState()
{
    // Acquire the state lock so destruction synchronizes with any thread
    // that may still be touching this state; by this point no events may
    // remain registered (nothrow assert: we are in a destructor).
    boost::mutex::scoped_lock lock(m_mutex);
    MORDOR_NOTHROW_ASSERT(!m_events);
}
108
109IOManager::AsyncState::EventContext &
110IOManager::AsyncState::contextForEvent(Event event)
111{
112    switch (event) {
113        case READ:
114            return m_in;
115        case WRITE:
116            return m_out;
117        case CLOSE:
118            return m_close;
119        default:
120            MORDOR_NOTREACHED();
121    }
122}
123
124bool
125IOManager::AsyncState::triggerEvent(Event event, size_t &pendingEventCount)
126{
127    if (!(m_events & event))
128        return false;
129    m_events = (Event)(m_events & ~event);
130    atomicDecrement(pendingEventCount);
131    EventContext &context = contextForEvent(event);
132    if (context.dg) {
133        context.scheduler->schedule(&context.dg);
134    } else {
135        context.scheduler->schedule(&context.fiber);
136    }
137    context.scheduler = NULL;
138    return true;
139}
140
141void
142IOManager::AsyncState::asyncResetContext(AsyncState::EventContext& context)
143{
144    // fiber.reset is not necessary to be running under the lock.
145    // However, it is needed to acquire the lock and then unlock
146    // to ensure that this function is executed after the other
147    // fiber which scheduled this async reset call.
148    boost::mutex::scoped_lock lock(m_mutex);
149    lock.unlock();
150    context.fiber.reset();
151    context.dg = NULL;
152}
153
void
IOManager::AsyncState::resetContext(EventContext &context)
{
    // asynchronously reset fiber/dg to avoid destroying in IOManager::idle
    // NOTE: this function has the pre-condition that the m_mutex is
    // already acquired in upper level (which is true right now), in this
    // way, the asyncReset will not be executed until the m_mutex is released,
    // and it is surely run in Scheduler working fiber instead of idle fiber.
    // NOTE(review): boost::bind stores a *copy* of `context` inside the
    // bound functor; that copy keeps the fiber/dg references alive until
    // asyncResetContext runs on a worker fiber and the functor is
    // destroyed — presumably this deferred release is the point, but the
    // claim below about passing the context "address" looks inaccurate;
    // confirm against boost::bind argument-storage semantics.
    // it is fine to pass context address to the boost function
    // since the address will be always valid until ~IOManager()
    context.scheduler->schedule(boost::bind(
        &IOManager::AsyncState::asyncResetContext, this, context));
    context.scheduler = NULL;
    // Clear the originals immediately; the deferred copy handles the last
    // release outside the caller's critical section.
    context.fiber.reset();
    context.dg = NULL;
}
170
// Builds the epoll instance plus the non-blocking self-pipe used by
// tickle() to wake idle threads out of epoll_wait. Every failure path
// closes whatever was opened so far before throwing, so a throwing
// constructor leaks no fds.
IOManager::IOManager(size_t threads, bool useCaller, bool autoStart, size_t batchSize)
    : Scheduler(threads, useCaller, batchSize),
      m_pendingEventCount(0)
{
    // The size hint (5000) is ignored by modern kernels but must be > 0.
    m_epfd = epoll_create(5000);
    MORDOR_LOG_LEVEL(g_log, m_epfd <= 0 ? Log::ERROR : Log::TRACE) << this
        << " epoll_create(5000): " << m_epfd;
    if (m_epfd <= 0)
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_create");
    int rc = pipe(m_tickleFds);
    MORDOR_LOG_LEVEL(g_log, rc ? Log::ERROR : Log::VERBOSE) << this << " pipe(): "
        << rc << " (" << lastError() << ")";
    if (rc) {
        close(m_epfd);
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("pipe");
    }
    MORDOR_ASSERT(m_tickleFds[0] > 0);
    MORDOR_ASSERT(m_tickleFds[1] > 0);
    // Register the read end of the pipe edge-triggered; idle() drains it
    // completely on each wakeup, which edge-triggering requires.
    epoll_event event;
    memset(&event, 0, sizeof(epoll_event));
    event.events = EPOLLIN | EPOLLET;
    event.data.fd = m_tickleFds[0];
    // Both pipe ends are non-blocking: idle() reads until EAGAIN and
    // tickle() tolerates a full pipe (see MORDOR_VERIFY in tickle()).
    rc = fcntl(m_tickleFds[0], F_SETFL, O_NONBLOCK);
    if (rc == -1) {
        close(m_tickleFds[0]);
        close(m_tickleFds[1]);
        close(m_epfd);
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("fcntl");
    }
    rc = fcntl(m_tickleFds[1], F_SETFL, O_NONBLOCK);
    if (rc == -1) {
        close(m_tickleFds[0]);
        close(m_tickleFds[1]);
        close(m_epfd);
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("fcntl");
    }
    rc = epoll_ctl(m_epfd, EPOLL_CTL_ADD, m_tickleFds[0], &event);
    MORDOR_LOG_LEVEL(g_log, rc ? Log::ERROR : Log::VERBOSE) << this
        << " epoll_ctl(" << m_epfd << ", EPOLL_CTL_ADD, " << m_tickleFds[0]
        << ", EPOLLIN | EPOLLET): " << rc << " (" << lastError() << ")";
    if (rc) {
        close(m_tickleFds[0]);
        close(m_tickleFds[1]);
        close(m_epfd);
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_ctl");
    }
    if (autoStart) {
        try {
            start();
        } catch (...) {
            close(m_tickleFds[0]);
            close(m_tickleFds[1]);
            close(m_epfd);
            throw;
        }
    }
}
228
229IOManager::~IOManager()
230{
231    stop();
232    close(m_epfd);
233    MORDOR_LOG_TRACE(g_log) << this << " close(" << m_epfd << ")";
234    close(m_tickleFds[0]);
235    MORDOR_LOG_VERBOSE(g_log) << this << " close(" << m_tickleFds[0] << ")";
236    close(m_tickleFds[1]);
237    // Yes, it would be more C++-esque to store a boost::shared_ptr in the
238    // vector, but that requires an extra allocation per fd for the counter
239    for (size_t i = 0; i < m_pendingEvents.size(); ++i) {
240        if (m_pendingEvents[i])
241            delete m_pendingEvents[i];
242    }
243}
244
245bool
246IOManager::stopping()
247{
248    unsigned long long timeout;
249    return stopping(timeout);
250}
251
// Registers interest in a single event (READ, WRITE or CLOSE) on fd.
// When the event fires (or is cancelled), either the delegate dg is
// scheduled, or — when dg is empty — the calling fiber is resumed on its
// current scheduler. Each event may be registered at most once per fd at
// a time (asserted). Throws on epoll_ctl failure.
void
IOManager::registerEvent(int fd, Event event, boost::function<void ()> dg)
{
    MORDOR_ASSERT(fd > 0);
    MORDOR_ASSERT(Scheduler::getThis());
    MORDOR_ASSERT(dg || Fiber::getThis());
    MORDOR_ASSERT(event == READ || event == WRITE || event == CLOSE);

    // Look up our state in the global map, expanding it if necessary
    // (slot fd-1 holds fd; grow by 1.5x to amortize resizes).
    boost::mutex::scoped_lock lock(m_mutex);
    if (m_pendingEvents.size() < (size_t)fd)
        m_pendingEvents.resize(fd * 3 / 2);
    if (!m_pendingEvents[fd - 1]) {
        m_pendingEvents[fd - 1] = new AsyncState();
        m_pendingEvents[fd - 1]->m_fd = fd;
    }
    AsyncState &state = *m_pendingEvents[fd - 1];
    MORDOR_ASSERT(fd == state.m_fd);
    // Drop the map lock before taking the per-fd lock; the AsyncState
    // pointer stays valid (slots are only freed in ~IOManager).
    lock.unlock();

    boost::mutex::scoped_lock lock2(state.m_mutex);

    MORDOR_ASSERT(!(state.m_events & event));
    // First event on this fd ADDs it to epoll; further events MOD the
    // existing registration. Always edge-triggered.
    int op = state.m_events ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
    epoll_event epevent;
    epevent.events = EPOLLET | state.m_events | event;
    epevent.data.ptr = &state;
    int rc = epoll_ctl(m_epfd, op, fd, &epevent);
    MORDOR_LOG_LEVEL(g_log, rc ? Log::ERROR : Log::VERBOSE) << this
        << " epoll_ctl(" << m_epfd << ", " << (epoll_ctl_op_t)op << ", "
        << fd << ", " << (EPOLL_EVENTS)epevent.events << "): " << rc
        << " (" << lastError() << ")";
    if (rc)
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_ctl");
    // Book-keeping happens only after epoll accepted the registration,
    // so a throw above leaves the state untouched.
    atomicIncrement(m_pendingEventCount);
    state.m_events = (Event)(state.m_events | event);
    AsyncState::EventContext &context = state.contextForEvent(event);
    MORDOR_ASSERT(!context.scheduler);
    MORDOR_ASSERT(!context.fiber);
    MORDOR_ASSERT(!context.dg);
    context.scheduler = Scheduler::getThis();
    if (dg) {
        context.dg.swap(dg);
    } else {
        context.fiber = Fiber::getThis();
    }
}
299
// Removes a registered event from fd WITHOUT waking the waiter — the
// stored fiber/delegate is discarded (cf. cancelEvent, which triggers
// it). Returns false if the event was not registered. Throws on
// epoll_ctl failure.
bool
IOManager::unregisterEvent(int fd, Event event)
{
    MORDOR_ASSERT(fd > 0);
    MORDOR_ASSERT(event == READ || event == WRITE || event == CLOSE);

    boost::mutex::scoped_lock lock(m_mutex);
    if (m_pendingEvents.size() < (size_t)fd)
        return false;
    if (!m_pendingEvents[fd - 1])
        return false;
    AsyncState &state = *m_pendingEvents[fd - 1];
    MORDOR_ASSERT(fd == state.m_fd);
    // Per-fd lock replaces the map lock; see registerEvent for the
    // lifetime argument.
    lock.unlock();

    boost::mutex::scoped_lock lock2(state.m_mutex);
    if (!(state.m_events & event))
        return false;

    MORDOR_ASSERT(fd == state.m_fd);
    // Last event on the fd DELs it from epoll; otherwise MOD down to the
    // remaining set (still edge-triggered).
    Event newEvents = (Event)(state.m_events &~event);
    int op = newEvents ? EPOLL_CTL_MOD : EPOLL_CTL_DEL;
    epoll_event epevent;
    epevent.events = EPOLLET | newEvents;
    epevent.data.ptr = &state;
    int rc = epoll_ctl(m_epfd, op, fd, &epevent);
    MORDOR_LOG_LEVEL(g_log, rc ? Log::ERROR : Log::VERBOSE) << this
        << " epoll_ctl(" << m_epfd << ", " << (epoll_ctl_op_t)op << ", "
        << fd << ", " << (EPOLL_EVENTS)epevent.events << "): " << rc
        << " (" << lastError() << ")";
    if (rc)
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_ctl");
    atomicDecrement(m_pendingEventCount);
    state.m_events = newEvents;
    AsyncState::EventContext &context = state.contextForEvent(event);
    // spawn a dedicated fiber to do the cleanup (avoids destroying the
    // fiber/delegate while idle() may still be iterating)
    state.resetContext(context);
    return true;
}
339
// Cancels a registered event AND wakes its waiter: the stored delegate
// or fiber is scheduled as if the event had fired (cf. unregisterEvent,
// which discards it). Returns false if the event was not registered.
// Throws on epoll_ctl failure.
bool
IOManager::cancelEvent(int fd, Event event)
{
    MORDOR_ASSERT(fd > 0);
    MORDOR_ASSERT(event == READ || event == WRITE || event == CLOSE);

    boost::mutex::scoped_lock lock(m_mutex);
    if (m_pendingEvents.size() < (size_t)fd)
        return false;
    if (!m_pendingEvents[fd - 1])
        return false;
    AsyncState &state = *m_pendingEvents[fd - 1];
    MORDOR_ASSERT(fd == state.m_fd);
    // Swap the map lock for the per-fd lock; see registerEvent.
    lock.unlock();

    boost::mutex::scoped_lock lock2(state.m_mutex);
    if (!(state.m_events & event))
        return false;

    MORDOR_ASSERT(fd == state.m_fd);
    // Shrink (MOD) or remove (DEL) the epoll registration first, so the
    // event cannot fire concurrently once we trigger the waiter below.
    Event newEvents = (Event)(state.m_events &~event);
    int op = newEvents ? EPOLL_CTL_MOD : EPOLL_CTL_DEL;
    epoll_event epevent;
    epevent.events = EPOLLET | newEvents;
    epevent.data.ptr = &state;
    int rc = epoll_ctl(m_epfd, op, fd, &epevent);
    MORDOR_LOG_LEVEL(g_log, rc ? Log::ERROR : Log::VERBOSE) << this
        << " epoll_ctl(" << m_epfd << ", " << (epoll_ctl_op_t)op << ", "
        << fd << ", " << (EPOLL_EVENTS)epevent.events << "): " << rc
        << " (" << lastError() << ")";
    if (rc)
        MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_ctl");
    // triggerEvent clears the event bit and decrements the pending count.
    state.triggerEvent(event, m_pendingEventCount);
    return true;
}
375
376bool
377IOManager::stopping(unsigned long long &nextTimeout)
378{
379    nextTimeout = nextTimer();
380    return nextTimeout == ~0ull && Scheduler::stopping() &&
381        m_pendingEventCount == 0;
382}
383
// The idle loop run by scheduler threads with nothing to do: blocks in
// epoll_wait (bounded by the next timer deadline), dispatches expired
// timers, drains tickle-pipe wakeups, and triggers waiters for fds that
// became ready. Returns when stopping() holds or the fiber is aborted.
void
IOManager::idle()
{
    epoll_event events[64];
    while (true) {
        unsigned long long nextTimeout;
        if (stopping(nextTimeout))
            return;
        int rc;
        int timeout;
        do {
            // Convert the next timer deadline (us) to epoll's ms timeout,
            // rounding up so we never wake before the timer is due;
            // -1 blocks indefinitely when no timer is pending.
            if (nextTimeout != ~0ull)
                timeout = (int)(nextTimeout / 1000) + 1;
            else
                timeout = -1;
            rc = epoll_wait(m_epfd, events, 64, timeout);
            // Retry on signal interruption with a refreshed deadline.
            if (rc < 0 && errno == EINTR)
                nextTimeout = nextTimer();
            else
                break;
        } while (true);
        MORDOR_LOG_LEVEL(g_log, rc < 0 ? Log::ERROR : Log::VERBOSE) << this
            << " epoll_wait(" << m_epfd << ", 64, " << timeout << "): " << rc
            << " (" << lastError() << ")";
        if (rc < 0)
            MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_wait");
        // Dispatch timers before fd events.
        std::vector<boost::function<void ()> > expired = processTimers();
        if (!expired.empty()) {
            schedule(expired.begin(), expired.end());
            expired.clear();
        }

        // An epoll_ctl failure mid-loop is remembered and rethrown only
        // after all ready fds have been handled.
        boost::exception_ptr exception;
        for(int i = 0; i < rc; ++i) {
            epoll_event &event = events[i];
            if (event.data.fd == m_tickleFds[0]) {
                unsigned char dummy[256];
                int rc2;
                // every tickle write only 1 byte
                // but it does not have to be read by 1 byte
                // try to read more to save read() syscall
                while((rc2 = read(m_tickleFds[0], dummy, 256)) > 0) {
                    MORDOR_LOG_VERBOSE(g_log) << this << " received " << rc2 << " tickles";
                }
                // The pipe is non-blocking; draining must end in EAGAIN.
                MORDOR_VERIFY(rc2 < 0 && errno == EAGAIN);
                continue;
            }

            // For non-tickle fds, data.ptr was set to the AsyncState by
            // registerEvent.
            AsyncState &state = *(AsyncState *)event.data.ptr;

            boost::mutex::scoped_lock lock2(state.m_mutex);
            MORDOR_LOG_TRACE(g_log) << " epoll_event {"
                << (EPOLL_EVENTS)event.events << ", " << state.m_fd
                << "}, registered for " << (EPOLL_EVENTS)state.m_events;

            // An error/hangup wakes both readers and writers so they can
            // observe the failure from their own syscalls.
            if (event.events & (EPOLLERR | EPOLLHUP))
                event.events |= EPOLLIN | EPOLLOUT;

            int incomingEvents = NONE;
            if (event.events & EPOLLIN)
                incomingEvents = READ;
            if (event.events & EPOLLOUT)
                incomingEvents |= WRITE;
            if (event.events & EPOLLRDHUP)
                incomingEvents |= CLOSE;

            // Nothing will be triggered, probably because a prior cancelEvent call
            // (probably on a different thread) already triggered it, so no
            // need to tell epoll anything
            if ((state.m_events & incomingEvents) == NONE)
                continue;

            // Shrink the epoll registration to whatever is still awaited.
            int remainingEvents = (state.m_events & ~incomingEvents);
            int op = remainingEvents ? EPOLL_CTL_MOD : EPOLL_CTL_DEL;
            event.events = EPOLLET | remainingEvents;
            int rc2 = epoll_ctl(m_epfd, op, state.m_fd, &event);
            MORDOR_LOG_LEVEL(g_log, rc2 ? Log::ERROR : Log::VERBOSE) << this
                << " epoll_ctl(" << m_epfd << ", " << (epoll_ctl_op_t)op << ", "
                << state.m_fd << ", " << (EPOLL_EVENTS)event.events << "): " << rc2
                << " (" << lastError() << ")";
            if (rc2) {
                try {
                    MORDOR_THROW_EXCEPTION_FROM_LAST_ERROR_API("epoll_ctl");
                } catch (boost::exception &) {
                    exception = boost::current_exception();
                    continue;
                }
            }
            bool triggered = false;
            if (incomingEvents & READ)
                triggered = state.triggerEvent(READ, m_pendingEventCount);
            if (incomingEvents & WRITE)
                triggered = state.triggerEvent(WRITE, m_pendingEventCount) || triggered;
            if (incomingEvents & CLOSE)
                triggered = state.triggerEvent(CLOSE, m_pendingEventCount) || triggered;
            // The (state.m_events & incomingEvents) check above guarantees
            // at least one waiter fires.
            MORDOR_ASSERT(triggered);
        }
        if (exception)
            boost::rethrow_exception(exception);
        // Yield back to the scheduler so freshly scheduled work runs.
        try {
            Fiber::yield();
        } catch (OperationAbortedException &) {
            return;
        }
    }
}
490
491void
492IOManager::tickle()
493{
494    if (!hasIdleThreads()) {
495        MORDOR_LOG_VERBOSE(g_log) << this << " 0 idle thread, no tickle.";
496        return;
497    }
498    int rc = write(m_tickleFds[1], "T", 1);
499    MORDOR_LOG_VERBOSE(g_log) << this << " write(" << m_tickleFds[1] << ", 1): "
500        << rc << " (" << lastError() << ")";
501    MORDOR_VERIFY(rc == 1 || (rc < 0 && errno == EAGAIN));
502}
503
504}
505
506#endif