michael@0: /* michael@0: * Copyright 2000-2007 Niels Provos michael@0: * Copyright 2007-2012 Niels Provos, Nick Mathewson michael@0: * michael@0: * Redistribution and use in source and binary forms, with or without michael@0: * modification, are permitted provided that the following conditions michael@0: * are met: michael@0: * 1. Redistributions of source code must retain the above copyright michael@0: * notice, this list of conditions and the following disclaimer. michael@0: * 2. Redistributions in binary form must reproduce the above copyright michael@0: * notice, this list of conditions and the following disclaimer in the michael@0: * documentation and/or other materials provided with the distribution. michael@0: * 3. The name of the author may not be used to endorse or promote products michael@0: * derived from this software without specific prior written permission. michael@0: * michael@0: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR michael@0: * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES michael@0: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. michael@0: * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, michael@0: * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT michael@0: * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, michael@0: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY michael@0: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT michael@0: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF michael@0: * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. michael@0: */ michael@0: #include "event2/event-config.h" michael@0: michael@0: #include michael@0: #include michael@0: #include michael@0: #ifdef _EVENT_HAVE_SYS_TIME_H michael@0: #include michael@0: #endif michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: #ifdef _EVENT_HAVE_FCNTL_H michael@0: #include michael@0: #endif michael@0: michael@0: #include "event-internal.h" michael@0: #include "evsignal-internal.h" michael@0: #include "event2/thread.h" michael@0: #include "evthread-internal.h" michael@0: #include "log-internal.h" michael@0: #include "evmap-internal.h" michael@0: #include "changelist-internal.h" michael@0: michael@0: struct epollop { michael@0: struct epoll_event *events; michael@0: int nevents; michael@0: int epfd; michael@0: }; michael@0: michael@0: static void *epoll_init(struct event_base *); michael@0: static int epoll_dispatch(struct event_base *, struct timeval *); michael@0: static void epoll_dealloc(struct event_base *); michael@0: michael@0: static const struct eventop epollops_changelist = { michael@0: "epoll (with changelist)", michael@0: epoll_init, michael@0: event_changelist_add, michael@0: event_changelist_del, michael@0: epoll_dispatch, michael@0: epoll_dealloc, michael@0: 1, /* need reinit */ michael@0: EV_FEATURE_ET|EV_FEATURE_O1, michael@0: EVENT_CHANGELIST_FDINFO_SIZE michael@0: }; michael@0: michael@0: michael@0: static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd, michael@0: short old, short events, void *p); michael@0: static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd, michael@0: short old, short events, void *p); michael@0: michael@0: const struct eventop epollops = { michael@0: "epoll", michael@0: epoll_init, michael@0: epoll_nochangelist_add, michael@0: epoll_nochangelist_del, michael@0: epoll_dispatch, michael@0: epoll_dealloc, michael@0: 1, /* need reinit */ michael@0: EV_FEATURE_ET|EV_FEATURE_O1, michael@0: 0 michael@0: }; michael@0: michael@0: #define INITIAL_NEVENT 32 michael@0: #define MAX_NEVENT 4096 michael@0: michael@0: /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout michael@0: * values bigger than (LONG_MAX - 999ULL)/HZ. HZ in the wild can be michael@0: * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the michael@0: * largest number of msec we can support here is 2147482. Let's michael@0: * round that down by 47 seconds. michael@0: */ michael@0: #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000) michael@0: michael@0: static void * michael@0: epoll_init(struct event_base *base) michael@0: { michael@0: int epfd; michael@0: struct epollop *epollop; michael@0: michael@0: /* Initialize the kernel queue. (The size field is ignored since michael@0: * 2.6.8.) */ michael@0: if ((epfd = epoll_create(32000)) == -1) { michael@0: if (errno != ENOSYS) michael@0: event_warn("epoll_create"); michael@0: return (NULL); michael@0: } michael@0: michael@0: evutil_make_socket_closeonexec(epfd); michael@0: michael@0: if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) { michael@0: close(epfd); michael@0: return (NULL); michael@0: } michael@0: michael@0: epollop->epfd = epfd; michael@0: michael@0: /* Initialize fields */ michael@0: epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event)); michael@0: if (epollop->events == NULL) { michael@0: mm_free(epollop); michael@0: close(epfd); michael@0: return (NULL); michael@0: } michael@0: epollop->nevents = INITIAL_NEVENT; michael@0: michael@0: if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 || michael@0: ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 && michael@0: evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL)) michael@0: base->evsel = &epollops_changelist; michael@0: michael@0: evsig_init(base); michael@0: michael@0: return (epollop); michael@0: } michael@0: michael@0: static const char * michael@0: change_to_string(int change) michael@0: { michael@0: change &= (EV_CHANGE_ADD|EV_CHANGE_DEL); michael@0: if (change == EV_CHANGE_ADD) { michael@0: return "add"; michael@0: } else if (change == EV_CHANGE_DEL) { michael@0: return "del"; michael@0: } else if (change == 0) { michael@0: return "none"; michael@0: } else { michael@0: return "???"; michael@0: } michael@0: } michael@0: michael@0: static const char * michael@0: epoll_op_to_string(int op) michael@0: { michael@0: return op == EPOLL_CTL_ADD?"ADD": michael@0: op == EPOLL_CTL_DEL?"DEL": michael@0: op == EPOLL_CTL_MOD?"MOD": michael@0: "???"; michael@0: } michael@0: michael@0: static int michael@0: epoll_apply_one_change(struct event_base *base, michael@0: struct epollop *epollop, michael@0: const struct event_change *ch) michael@0: { michael@0: struct epoll_event epev; michael@0: int op, events = 0; michael@0: michael@0: if (1) { michael@0: /* The logic here is a little tricky. If we had no events set michael@0: on the fd before, we need to set op="ADD" and set michael@0: events=the events we want to add. If we had any events set michael@0: on the fd before, and we want any events to remain on the michael@0: fd, we need to say op="MOD" and set events=the events we michael@0: want to remain. But if we want to delete the last event, michael@0: we say op="DEL" and set events=the remaining events. What michael@0: fun! michael@0: */ michael@0: michael@0: /* TODO: Turn this into a switch or a table lookup. */ michael@0: michael@0: if ((ch->read_change & EV_CHANGE_ADD) || michael@0: (ch->write_change & EV_CHANGE_ADD)) { michael@0: /* If we are adding anything at all, we'll want to do michael@0: * either an ADD or a MOD. */ michael@0: events = 0; michael@0: op = EPOLL_CTL_ADD; michael@0: if (ch->read_change & EV_CHANGE_ADD) { michael@0: events |= EPOLLIN; michael@0: } else if (ch->read_change & EV_CHANGE_DEL) { michael@0: ; michael@0: } else if (ch->old_events & EV_READ) { michael@0: events |= EPOLLIN; michael@0: } michael@0: if (ch->write_change & EV_CHANGE_ADD) { michael@0: events |= EPOLLOUT; michael@0: } else if (ch->write_change & EV_CHANGE_DEL) { michael@0: ; michael@0: } else if (ch->old_events & EV_WRITE) { michael@0: events |= EPOLLOUT; michael@0: } michael@0: if ((ch->read_change|ch->write_change) & EV_ET) michael@0: events |= EPOLLET; michael@0: michael@0: if (ch->old_events) { michael@0: /* If MOD fails, we retry as an ADD, and if michael@0: * ADD fails we will retry as a MOD. So the michael@0: * only hard part here is to guess which one michael@0: * will work. As a heuristic, we'll try michael@0: * MOD first if we think there were old michael@0: * events and ADD if we think there were none. michael@0: * michael@0: * We can be wrong about the MOD if the file michael@0: * has in fact been closed and re-opened. michael@0: * michael@0: * We can be wrong about the ADD if the michael@0: * the fd has been re-created with a dup() michael@0: * of the same file that it was before. michael@0: */ michael@0: op = EPOLL_CTL_MOD; michael@0: } michael@0: } else if ((ch->read_change & EV_CHANGE_DEL) || michael@0: (ch->write_change & EV_CHANGE_DEL)) { michael@0: /* If we're deleting anything, we'll want to do a MOD michael@0: * or a DEL. */ michael@0: op = EPOLL_CTL_DEL; michael@0: michael@0: if (ch->read_change & EV_CHANGE_DEL) { michael@0: if (ch->write_change & EV_CHANGE_DEL) { michael@0: events = EPOLLIN|EPOLLOUT; michael@0: } else if (ch->old_events & EV_WRITE) { michael@0: events = EPOLLOUT; michael@0: op = EPOLL_CTL_MOD; michael@0: } else { michael@0: events = EPOLLIN; michael@0: } michael@0: } else if (ch->write_change & EV_CHANGE_DEL) { michael@0: if (ch->old_events & EV_READ) { michael@0: events = EPOLLIN; michael@0: op = EPOLL_CTL_MOD; michael@0: } else { michael@0: events = EPOLLOUT; michael@0: } michael@0: } michael@0: } michael@0: michael@0: if (!events) michael@0: return 0; michael@0: michael@0: memset(&epev, 0, sizeof(epev)); michael@0: epev.data.fd = ch->fd; michael@0: epev.events = events; michael@0: if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) { michael@0: if (op == EPOLL_CTL_MOD && errno == ENOENT) { michael@0: /* If a MOD operation fails with ENOENT, the michael@0: * fd was probably closed and re-opened. We michael@0: * should retry the operation as an ADD. michael@0: */ michael@0: if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) { michael@0: event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too", michael@0: (int)epev.events, ch->fd); michael@0: return -1; michael@0: } else { michael@0: event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.", michael@0: (int)epev.events, michael@0: ch->fd)); michael@0: } michael@0: } else if (op == EPOLL_CTL_ADD && errno == EEXIST) { michael@0: /* If an ADD operation fails with EEXIST, michael@0: * either the operation was redundant (as with a michael@0: * precautionary add), or we ran into a fun michael@0: * kernel bug where using dup*() to duplicate the michael@0: * same file into the same fd gives you the same epitem michael@0: * rather than a fresh one. For the second case, michael@0: * we must retry with MOD. */ michael@0: if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) { michael@0: event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too", michael@0: (int)epev.events, ch->fd); michael@0: return -1; michael@0: } else { michael@0: event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.", michael@0: (int)epev.events, michael@0: ch->fd)); michael@0: } michael@0: } else if (op == EPOLL_CTL_DEL && michael@0: (errno == ENOENT || errno == EBADF || michael@0: errno == EPERM)) { michael@0: /* If a delete fails with one of these errors, michael@0: * that's fine too: we closed the fd before we michael@0: * got around to calling epoll_dispatch. */ michael@0: event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.", michael@0: (int)epev.events, michael@0: ch->fd, michael@0: strerror(errno))); michael@0: } else { michael@0: event_warn("Epoll %s(%d) on fd %d failed. Old events were %d; read change was %d (%s); write change was %d (%s)", michael@0: epoll_op_to_string(op), michael@0: (int)epev.events, michael@0: ch->fd, michael@0: ch->old_events, michael@0: ch->read_change, michael@0: change_to_string(ch->read_change), michael@0: ch->write_change, michael@0: change_to_string(ch->write_change)); michael@0: return -1; michael@0: } michael@0: } else { michael@0: event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]", michael@0: epoll_op_to_string(op), michael@0: (int)epev.events, michael@0: (int)ch->fd, michael@0: ch->old_events, michael@0: ch->read_change, michael@0: ch->write_change)); michael@0: } michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: static int michael@0: epoll_apply_changes(struct event_base *base) michael@0: { michael@0: struct event_changelist *changelist = &base->changelist; michael@0: struct epollop *epollop = base->evbase; michael@0: struct event_change *ch; michael@0: michael@0: int r = 0; michael@0: int i; michael@0: michael@0: for (i = 0; i < changelist->n_changes; ++i) { michael@0: ch = &changelist->changes[i]; michael@0: if (epoll_apply_one_change(base, epollop, ch) < 0) michael@0: r = -1; michael@0: } michael@0: michael@0: return (r); michael@0: } michael@0: michael@0: static int michael@0: epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd, michael@0: short old, short events, void *p) michael@0: { michael@0: struct event_change ch; michael@0: ch.fd = fd; michael@0: ch.old_events = old; michael@0: ch.read_change = ch.write_change = 0; michael@0: if (events & EV_WRITE) michael@0: ch.write_change = EV_CHANGE_ADD | michael@0: (events & EV_ET); michael@0: if (events & EV_READ) michael@0: ch.read_change = EV_CHANGE_ADD | michael@0: (events & EV_ET); michael@0: michael@0: return epoll_apply_one_change(base, base->evbase, &ch); michael@0: } michael@0: michael@0: static int michael@0: epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd, michael@0: short old, short events, void *p) michael@0: { michael@0: struct event_change ch; michael@0: ch.fd = fd; michael@0: ch.old_events = old; michael@0: ch.read_change = ch.write_change = 0; michael@0: if (events & EV_WRITE) michael@0: ch.write_change = EV_CHANGE_DEL; michael@0: if (events & EV_READ) michael@0: ch.read_change = EV_CHANGE_DEL; michael@0: michael@0: return epoll_apply_one_change(base, base->evbase, &ch); michael@0: } michael@0: michael@0: static int michael@0: epoll_dispatch(struct event_base *base, struct timeval *tv) michael@0: { michael@0: struct epollop *epollop = base->evbase; michael@0: struct epoll_event *events = epollop->events; michael@0: int i, res; michael@0: long timeout = -1; michael@0: michael@0: if (tv != NULL) { michael@0: timeout = evutil_tv_to_msec(tv); michael@0: if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) { michael@0: /* Linux kernels can wait forever if the timeout is michael@0: * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */ michael@0: timeout = MAX_EPOLL_TIMEOUT_MSEC; michael@0: } michael@0: } michael@0: michael@0: epoll_apply_changes(base); michael@0: event_changelist_remove_all(&base->changelist, base); michael@0: michael@0: EVBASE_RELEASE_LOCK(base, th_base_lock); michael@0: michael@0: res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout); michael@0: michael@0: EVBASE_ACQUIRE_LOCK(base, th_base_lock); michael@0: michael@0: if (res == -1) { michael@0: if (errno != EINTR) { michael@0: event_warn("epoll_wait"); michael@0: return (-1); michael@0: } michael@0: michael@0: return (0); michael@0: } michael@0: michael@0: event_debug(("%s: epoll_wait reports %d", __func__, res)); michael@0: EVUTIL_ASSERT(res <= epollop->nevents); michael@0: michael@0: for (i = 0; i < res; i++) { michael@0: int what = events[i].events; michael@0: short ev = 0; michael@0: michael@0: if (what & (EPOLLHUP|EPOLLERR)) { michael@0: ev = EV_READ | EV_WRITE; michael@0: } else { michael@0: if (what & EPOLLIN) michael@0: ev |= EV_READ; michael@0: if (what & EPOLLOUT) michael@0: ev |= EV_WRITE; michael@0: } michael@0: michael@0: if (!ev) michael@0: continue; michael@0: michael@0: evmap_io_active(base, events[i].data.fd, ev | EV_ET); michael@0: } michael@0: michael@0: if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) { michael@0: /* We used all of the event space this time. We should michael@0: be ready for more events next time. */ michael@0: int new_nevents = epollop->nevents * 2; michael@0: struct epoll_event *new_events; michael@0: michael@0: new_events = mm_realloc(epollop->events, michael@0: new_nevents * sizeof(struct epoll_event)); michael@0: if (new_events) { michael@0: epollop->events = new_events; michael@0: epollop->nevents = new_nevents; michael@0: } michael@0: } michael@0: michael@0: return (0); michael@0: } michael@0: michael@0: michael@0: static void michael@0: epoll_dealloc(struct event_base *base) michael@0: { michael@0: struct epollop *epollop = base->evbase; michael@0: michael@0: evsig_dealloc(base); michael@0: if (epollop->events) michael@0: mm_free(epollop->events); michael@0: if (epollop->epfd >= 0) michael@0: close(epollop->epfd); michael@0: michael@0: memset(epollop, 0, sizeof(struct epollop)); michael@0: mm_free(epollop); michael@0: }