/contrib/bind9/lib/isc/unix/socket.c
https://bitbucket.org/freebsd/freebsd-head/ · C · 5944 lines · 4372 code · 781 blank · 791 comment · 886 complexity · 2bc52ee0f9ebc7e39a20f5b2ce900a03 MD5 · raw file
Large files are truncated click here to view the full file
- /*
- * Copyright (C) 2004-2012 Internet Systems Consortium, Inc. ("ISC")
- * Copyright (C) 1998-2003 Internet Software Consortium.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
- * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
- * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
- * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
- * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
- * PERFORMANCE OF THIS SOFTWARE.
- */
- /* $Id$ */
- /*! \file */
- #include <config.h>
- #include <sys/param.h>
- #include <sys/types.h>
- #include <sys/socket.h>
- #include <sys/stat.h>
- #include <sys/time.h>
- #include <sys/uio.h>
- #include <errno.h>
- #include <fcntl.h>
- #include <stddef.h>
- #include <stdlib.h>
- #include <string.h>
- #include <unistd.h>
- #include <isc/buffer.h>
- #include <isc/bufferlist.h>
- #include <isc/condition.h>
- #include <isc/formatcheck.h>
- #include <isc/list.h>
- #include <isc/log.h>
- #include <isc/mem.h>
- #include <isc/msgs.h>
- #include <isc/mutex.h>
- #include <isc/net.h>
- #include <isc/once.h>
- #include <isc/platform.h>
- #include <isc/print.h>
- #include <isc/region.h>
- #include <isc/socket.h>
- #include <isc/stats.h>
- #include <isc/strerror.h>
- #include <isc/task.h>
- #include <isc/thread.h>
- #include <isc/util.h>
- #include <isc/xml.h>
- #ifdef ISC_PLATFORM_HAVESYSUNH
- #include <sys/un.h>
- #endif
- #ifdef ISC_PLATFORM_HAVEKQUEUE
- #include <sys/event.h>
- #endif
- #ifdef ISC_PLATFORM_HAVEEPOLL
- #include <sys/epoll.h>
- #endif
- #ifdef ISC_PLATFORM_HAVEDEVPOLL
- #if defined(HAVE_SYS_DEVPOLL_H)
- #include <sys/devpoll.h>
- #elif defined(HAVE_DEVPOLL_H)
- #include <devpoll.h>
- #endif
- #endif
- #include "errno2result.h"
- /* See task.c about the following definition: */
- #ifdef BIND9
- #ifdef ISC_PLATFORM_USETHREADS
- #define USE_WATCHER_THREAD
- #else
- #define USE_SHARED_MANAGER
- #endif /* ISC_PLATFORM_USETHREADS */
- #endif /* BIND9 */
- #ifndef USE_WATCHER_THREAD
- #include "socket_p.h"
- #include "../task_p.h"
- #endif /* USE_WATCHER_THREAD */
- #if defined(SO_BSDCOMPAT) && defined(__linux__)
- #include <sys/utsname.h>
- #endif
/*%
 * Pick the I/O multiplexing backend.  The first available facility in
 * the order kqueue, epoll, /dev/poll wins; select() is the fallback.
 */
#if defined(ISC_PLATFORM_HAVEKQUEUE)
#define USE_KQUEUE
#elif defined(ISC_PLATFORM_HAVEEPOLL)
#define USE_EPOLL
#elif defined(ISC_PLATFORM_HAVEDEVPOLL)
#define USE_DEVPOLL
/*% Per-descriptor flags recording which directions are being polled. */
typedef struct {
	unsigned int	want_read : 1;
	unsigned int	want_write : 1;
} pollinfo_t;
#else
#define USE_SELECT
#endif /* ISC_PLATFORM_HAVEKQUEUE */
#ifndef USE_WATCHER_THREAD
/*%
 * Wait-state record, declared only when no watcher thread is used.
 * Its contents depend on the multiplexing backend selected above.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
struct isc_socketwait {
	int		nevents;
};
#elif defined(USE_SELECT)
struct isc_socketwait {
	fd_set		*readset;
	fd_set		*writeset;
	int		nfds;
	int		maxfd;
};
#endif /* USE_KQUEUE */
#endif /* !USE_WATCHER_THREAD */
- /*%
- * Maximum number of allowable open sockets. This is also the maximum
- * allowable socket file descriptor.
- *
- * Care should be taken before modifying this value for select():
- * The API standard doesn't ensure that select() accepts more than (the system
- * default of) FD_SETSIZE descriptors, and the default size should in fact be
- * fine in the vast majority of cases. This constant should therefore be
- * increased only when absolutely necessary and possible, i.e., the server is
- * exhausting all available file descriptors (up to FD_SETSIZE) and the
- * select() function and FD_xxx macros support larger values than FD_SETSIZE
- * (which may not always be true, but we keep using some of them to ensure as
- * much portability as possible). Note also that overall server performance
- * may be considerably worse with a larger value of this constant due to
- * inherent scalability problems of select().
- *
- * As a special note, this value shouldn't have to be touched if
- * this is a build for an authoritative only DNS server.
- */
#if !defined(ISC_SOCKET_MAXSOCKETS)
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#define ISC_SOCKET_MAXSOCKETS	4096
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS	FD_SETSIZE
#endif /* USE_KQUEUE... */
#endif /* ISC_SOCKET_MAXSOCKETS */

#if defined(USE_SELECT) && defined(__APPLE__)
/*%
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified run-time.
 */
#define _DARWIN_UNLIMITED_SELECT
#endif /* USE_SELECT && __APPLE__ */
#ifdef ISC_SOCKET_USE_POLLWATCH
/*%
 * Workaround for a Solaris /dev/poll kernel bug, enabled when this macro
 * is defined: the DP_POLL ioctl could keep sleeping even if socket I/O is
 * possible for some of the specified FDs.  The idea relies on the
 * observation that a busy server is likely to keep receiving packets.
 *
 * The watcher is a three-state machine:
 *
 * \li poll_idle: the initial state; the watcher sleeps until a socket
 *     event occurs, then moves to poll_active.
 * \li poll_active: the poll timeout is set to a short period
 *     (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If that timeout expires the
 *     watcher moves to poll_checking, keeping the same timeout.
 * \li poll_checking: the watcher tries to tell a pause between
 *     intermittent events from a triggered kernel bug.  If the next poll
 *     reports an event within the short period, the previous timeout was
 *     likely the kernel bug and the watcher returns to poll_active;
 *     otherwise it goes back to poll_idle.
 *
 * It's not clear whether this is a thread-related bug, but since we've only
 * seen this with threads, this workaround is used only when enabling threads.
 */
typedef enum {
	poll_idle,
	poll_active,
	poll_checking
} pollstate_t;

#if !defined(ISC_SOCKET_POLLWATCH_TIMEOUT)
#define ISC_SOCKET_POLLWATCH_TIMEOUT	10
#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif /* ISC_SOCKET_USE_POLLWATCH */
/*%
 * Number of per-FD lock buckets, and the mapping from a descriptor to its
 * bucket.  Without threads a single bucket (index 0) is used for every FD.
 */
#if defined(ISC_PLATFORM_USETHREADS)
#define FDLOCK_COUNT		1024
#define FDLOCK_ID(fd)		((fd) % FDLOCK_COUNT)
#else
#define FDLOCK_COUNT		1
#define FDLOCK_ID(fd)		0
#endif /* ISC_PLATFORM_USETHREADS */
/*%
 * Upper bound on the number of events exchanged with the kernel in one
 * call.  There should normally be no need for a large number.
 */
#if (defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)) && \
    !defined(ISC_SOCKET_MAXEVENTS)
#define ISC_SOCKET_MAXEVENTS	64
#endif

/*%
 * The socket length argument is an int on some systems, a size_t on
 * others and a socklen_t on yet others.  It is defined here so that it can
 * be changed easily if needed.
 */
#if !defined(ISC_SOCKADDR_LEN_T)
#define ISC_SOCKADDR_LEN_T	unsigned int
#endif
/*%
 * "Soft" errors: non-fatal results of network calls such as recv().
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but leave errno == 0.  That is broken, but it has to be
 * worked around here, so 0 is treated as a soft error as well.
 */
#define SOFT_ERROR(e)	((e) == EAGAIN || (e) == EWOULDBLOCK || \
			 (e) == EINTR || (e) == 0)
/*%
 * Expands to the (category, module, level) argument triple used by the
 * logging helpers in this file, at debug level 'x'.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*!<
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define TRACE			DLVL(TRACE_LEVEL)

#define CORRECTNESS_LEVEL	70
#define CORRECTNESS		DLVL(CORRECTNESS_LEVEL)

#define IOEVENT_LEVEL		60
#define IOEVENT			DLVL(IOEVENT_LEVEL)

#define EVENT_LEVEL		50
#define EVENT			DLVL(EVENT_LEVEL)

#define CREATION_LEVEL		20
#define CREATION		DLVL(CREATION_LEVEL)
- typedef isc_event_t intev_t;
- #define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o')
- #define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)
/*!
 * Control-message (cmsg) support is switched on for either of two reasons:
 *
 * \li IPv6 control information.  If the socket is an IPv6 socket we want
 *     to collect the destination address and interface so the client can
 *     set them on outgoing packets.
 *
 * \li Packet timestamps (SO_TIMESTAMP).  NetBSD and FreeBSD can timestamp
 *     packets.  XXXMLG Should we have a setsockopt() like interface to
 *     request timestamps, and if the OS doesn't do it for us, call
 *     gettimeofday() on every UDP receive?
 */
#if (defined(ISC_PLATFORM_HAVEIN6PKTINFO) || defined(SO_TIMESTAMP)) && \
    !defined(USE_CMSG)
#define USE_CMSG	1
#endif

/*%
 * The size to raise the receive buffer to (from BIND 8).
 */
#define RCVBUFSIZE	(32*1024)

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES	10
- typedef struct isc__socket isc__socket_t;
- typedef struct isc__socketmgr isc__socketmgr_t;
- #define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)
- struct isc__socket {
- /* Not locked. */
- isc_socket_t common;
- isc__socketmgr_t *manager;
- isc_mutex_t lock;
- isc_sockettype_t type;
- const isc_statscounter_t *statsindex;
- /* Locked by socket lock. */
- ISC_LINK(isc__socket_t) link;
- unsigned int references;
- int fd;
- int pf;
- char name[16];
- void * tag;
- ISC_LIST(isc_socketevent_t) send_list;
- ISC_LIST(isc_socketevent_t) recv_list;
- ISC_LIST(isc_socket_newconnev_t) accept_list;
- isc_socket_connev_t *connect_ev;
- /*
- * Internal events. Posted when a descriptor is readable or
- * writable. These are statically allocated and never freed.
- * They will be set to non-purgable before use.
- */
- intev_t readable_ev;
- intev_t writable_ev;
- isc_sockaddr_t peer_address; /* remote address */
- unsigned int pending_recv : 1,
- pending_send : 1,
- pending_accept : 1,
- listener : 1, /* listener socket */
- connected : 1,
- connecting : 1, /* connect pending */
- bound : 1; /* bound to local addr */
- #ifdef ISC_NET_RECVOVERFLOW
- unsigned char overflow; /* used for MSG_TRUNC fake */
- #endif
- char *recvcmsgbuf;
- ISC_SOCKADDR_LEN_T recvcmsgbuflen;
- char *sendcmsgbuf;
- ISC_SOCKADDR_LEN_T sendcmsgbuflen;
- void *fdwatcharg;
- isc_sockfdwatch_t fdwatchcb;
- int fdwatchflags;
- isc_task_t *fdwatchtask;
- };
- #define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
- #define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
- struct isc__socketmgr {
- /* Not locked. */
- isc_socketmgr_t common;
- isc_mem_t *mctx;
- isc_mutex_t lock;
- isc_mutex_t *fdlock;
- isc_stats_t *stats;
- #ifdef USE_KQUEUE
- int kqueue_fd;
- int nevents;
- struct kevent *events;
- #endif /* USE_KQUEUE */
- #ifdef USE_EPOLL
- int epoll_fd;
- int nevents;
- struct epoll_event *events;
- #endif /* USE_EPOLL */
- #ifdef USE_DEVPOLL
- int devpoll_fd;
- int nevents;
- struct pollfd *events;
- #endif /* USE_DEVPOLL */
- #ifdef USE_SELECT
- int fd_bufsize;
- #endif /* USE_SELECT */
- unsigned int maxsocks;
- #ifdef ISC_PLATFORM_USETHREADS
- int pipe_fds[2];
- #endif
- /* Locked by fdlock. */
- isc__socket_t **fds;
- int *fdstate;
- #ifdef USE_DEVPOLL
- pollinfo_t *fdpollinfo;
- #endif
- /* Locked by manager lock. */
- ISC_LIST(isc__socket_t) socklist;
- #ifdef USE_SELECT
- fd_set *read_fds;
- fd_set *read_fds_copy;
- fd_set *write_fds;
- fd_set *write_fds_copy;
- int maxfd;
- #endif /* USE_SELECT */
- int reserved; /* unlocked */
- #ifdef USE_WATCHER_THREAD
- isc_thread_t watcher;
- isc_condition_t shutdown_ok;
- #else /* USE_WATCHER_THREAD */
- unsigned int refs;
- #endif /* USE_WATCHER_THREAD */
- int maxudp;
- };
#ifdef USE_SHARED_MANAGER
/*% The single manager instance used when the manager is shared. */
static isc__socketmgr_t *socketmgr = NULL;
#endif /* USE_SHARED_MANAGER */

/*
 * Values stored in the manager's per-descriptor 'fdstate' array.
 */
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2

/*
 * send() and recv() iovec counts.  The receive side has one extra slot
 * when ISC_NET_RECVOVERFLOW is defined.
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif
- static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
- static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
- static void free_socket(isc__socket_t **);
- static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
- isc__socket_t **);
- static void destroy(isc__socket_t **);
- static void internal_accept(isc_task_t *, isc_event_t *);
- static void internal_connect(isc_task_t *, isc_event_t *);
- static void internal_recv(isc_task_t *, isc_event_t *);
- static void internal_send(isc_task_t *, isc_event_t *);
- static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
- static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
- static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
- static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
- struct msghdr *, struct iovec *, size_t *);
- static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
- struct msghdr *, struct iovec *, size_t *);
- #ifdef USE_WATCHER_THREAD
- static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
- #endif
- /*%
- * The following can be either static or public, depending on build environment.
- */
- #ifdef BIND9
- #define ISC_SOCKETFUNC_SCOPE
- #else
- #define ISC_SOCKETFUNC_SCOPE static
- #endif
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
- isc_socket_t **socketp);
- ISC_SOCKETFUNC_SCOPE void
- isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
- ISC_SOCKETFUNC_SCOPE void
- isc__socket_detach(isc_socket_t **socketp);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
- unsigned int maxsocks);
- ISC_SOCKETFUNC_SCOPE void
- isc__socketmgr_destroy(isc_socketmgr_t **managerp);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
- unsigned int minimum, isc_task_t *task,
- isc_taskaction_t action, const void *arg);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
- unsigned int minimum, isc_task_t *task,
- isc_taskaction_t action, const void *arg);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
- unsigned int minimum, isc_task_t *task,
- isc_socketevent_t *event, unsigned int flags);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_send(isc_socket_t *sock, isc_region_t *region,
- isc_task_t *task, isc_taskaction_t action, const void *arg);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
- isc_task_t *task, isc_taskaction_t action, const void *arg,
- isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
- isc_task_t *task, isc_taskaction_t action, const void *arg);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
- isc_task_t *task, isc_taskaction_t action, const void *arg,
- isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
- isc_task_t *task,
- isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
- isc_socketevent_t *event, unsigned int flags);
- ISC_SOCKETFUNC_SCOPE void
- isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
- isc_uint32_t owner, isc_uint32_t group);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
- unsigned int options);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_filter(isc_socket_t *sock, const char *filter);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_accept(isc_socket_t *sock,
- isc_task_t *task, isc_taskaction_t action, const void *arg);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
- isc_task_t *task, isc_taskaction_t action,
- const void *arg);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
- ISC_SOCKETFUNC_SCOPE void
- isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
- ISC_SOCKETFUNC_SCOPE isc_sockettype_t
- isc__socket_gettype(isc_socket_t *sock);
- ISC_SOCKETFUNC_SCOPE isc_boolean_t
- isc__socket_isbound(isc_socket_t *sock);
- ISC_SOCKETFUNC_SCOPE void
- isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
- #if defined(HAVE_LIBXML2) && defined(BIND9)
- ISC_SOCKETFUNC_SCOPE void
- isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
- #endif
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
- isc_sockfdwatch_t callback, void *cbarg,
- isc_task_t *task, isc_socket_t **socketp);
- ISC_SOCKETFUNC_SCOPE isc_result_t
- isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
- static struct {
- isc_socketmethods_t methods;
- /*%
- * The following are defined just for avoiding unused static functions.
- */
- #ifndef BIND9
- void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter,
- *listen, *accept, *getpeername, *isbound;
- #endif
- } socketmethods = {
- {
- isc__socket_attach,
- isc__socket_detach,
- isc__socket_bind,
- isc__socket_sendto,
- isc__socket_connect,
- isc__socket_recv,
- isc__socket_cancel,
- isc__socket_getsockname,
- isc__socket_gettype,
- isc__socket_ipv6only,
- isc__socket_fdwatchpoke
- }
- #ifndef BIND9
- ,
- (void *)isc__socket_recvv, (void *)isc__socket_send,
- (void *)isc__socket_sendv, (void *)isc__socket_sendto2,
- (void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
- (void *)isc__socket_filter, (void *)isc__socket_listen,
- (void *)isc__socket_accept, (void *)isc__socket_getpeername,
- (void *)isc__socket_isbound
- #endif
- };
- static isc_socketmgrmethods_t socketmgrmethods = {
- isc__socketmgr_destroy,
- isc__socket_create,
- isc__socket_fdwatchcreate
- };
/*
 * Message codes passed to select_poke()/wakeup_socket().  ACCEPT shares
 * its value with READ, and CONNECT with WRITE.
 */
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
#define SELECT_POKE_READ		(-3)
#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
#define SELECT_POKE_WRITE		(-4)
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE		(-5)

/*% True when socket 's' has no references left. */
#define SOCK_DEAD(s)			((s)->references == 0)
/*%
 * Slot numbers used to index the per-socket-type statistics counter
 * arrays (the "*statsindex" tables in this file).
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL,	/* 1 */
	STATID_CLOSE,		/* 2 */
	STATID_BINDFAIL,	/* 3 */
	STATID_CONNECTFAIL,	/* 4 */
	STATID_CONNECT,		/* 5 */
	STATID_ACCEPTFAIL,	/* 6 */
	STATID_ACCEPT,		/* 7 */
	STATID_SENDFAIL,	/* 8 */
	STATID_RECVFAIL		/* 9 */
};
- static const isc_statscounter_t upd4statsindex[] = {
- isc_sockstatscounter_udp4open,
- isc_sockstatscounter_udp4openfail,
- isc_sockstatscounter_udp4close,
- isc_sockstatscounter_udp4bindfail,
- isc_sockstatscounter_udp4connectfail,
- isc_sockstatscounter_udp4connect,
- -1,
- -1,
- isc_sockstatscounter_udp4sendfail,
- isc_sockstatscounter_udp4recvfail
- };
- static const isc_statscounter_t upd6statsindex[] = {
- isc_sockstatscounter_udp6open,
- isc_sockstatscounter_udp6openfail,
- isc_sockstatscounter_udp6close,
- isc_sockstatscounter_udp6bindfail,
- isc_sockstatscounter_udp6connectfail,
- isc_sockstatscounter_udp6connect,
- -1,
- -1,
- isc_sockstatscounter_udp6sendfail,
- isc_sockstatscounter_udp6recvfail
- };
- static const isc_statscounter_t tcp4statsindex[] = {
- isc_sockstatscounter_tcp4open,
- isc_sockstatscounter_tcp4openfail,
- isc_sockstatscounter_tcp4close,
- isc_sockstatscounter_tcp4bindfail,
- isc_sockstatscounter_tcp4connectfail,
- isc_sockstatscounter_tcp4connect,
- isc_sockstatscounter_tcp4acceptfail,
- isc_sockstatscounter_tcp4accept,
- isc_sockstatscounter_tcp4sendfail,
- isc_sockstatscounter_tcp4recvfail
- };
- static const isc_statscounter_t tcp6statsindex[] = {
- isc_sockstatscounter_tcp6open,
- isc_sockstatscounter_tcp6openfail,
- isc_sockstatscounter_tcp6close,
- isc_sockstatscounter_tcp6bindfail,
- isc_sockstatscounter_tcp6connectfail,
- isc_sockstatscounter_tcp6connect,
- isc_sockstatscounter_tcp6acceptfail,
- isc_sockstatscounter_tcp6accept,
- isc_sockstatscounter_tcp6sendfail,
- isc_sockstatscounter_tcp6recvfail
- };
- static const isc_statscounter_t unixstatsindex[] = {
- isc_sockstatscounter_unixopen,
- isc_sockstatscounter_unixopenfail,
- isc_sockstatscounter_unixclose,
- isc_sockstatscounter_unixbindfail,
- isc_sockstatscounter_unixconnectfail,
- isc_sockstatscounter_unixconnect,
- isc_sockstatscounter_unixacceptfail,
- isc_sockstatscounter_unixaccept,
- isc_sockstatscounter_unixsendfail,
- isc_sockstatscounter_unixrecvfail
- };
- static const isc_statscounter_t fdwatchstatsindex[] = {
- -1,
- -1,
- isc_sockstatscounter_fdwatchclose,
- isc_sockstatscounter_fdwatchbindfail,
- isc_sockstatscounter_fdwatchconnectfail,
- isc_sockstatscounter_fdwatchconnect,
- -1,
- -1,
- isc_sockstatscounter_fdwatchsendfail,
- isc_sockstatscounter_fdwatchrecvfail
- };
- #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
- defined(USE_WATCHER_THREAD)
- static void
- manager_log(isc__socketmgr_t *sockmgr,
- isc_logcategory_t *category, isc_logmodule_t *module, int level,
- const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
- static void
- manager_log(isc__socketmgr_t *sockmgr,
- isc_logcategory_t *category, isc_logmodule_t *module, int level,
- const char *fmt, ...)
- {
- char msgbuf[2048];
- va_list ap;
- if (! isc_log_wouldlog(isc_lctx, level))
- return;
- va_start(ap, fmt);
- vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
- va_end(ap);
- isc_log_write(isc_lctx, category, module, level,
- "sockmgr %p: %s", sockmgr, msgbuf);
- }
- #endif
- static void
- socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
- isc_logcategory_t *category, isc_logmodule_t *module, int level,
- isc_msgcat_t *msgcat, int msgset, int message,
- const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
- static void
- socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
- isc_logcategory_t *category, isc_logmodule_t *module, int level,
- isc_msgcat_t *msgcat, int msgset, int message,
- const char *fmt, ...)
- {
- char msgbuf[2048];
- char peerbuf[ISC_SOCKADDR_FORMATSIZE];
- va_list ap;
- if (! isc_log_wouldlog(isc_lctx, level))
- return;
- va_start(ap, fmt);
- vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
- va_end(ap);
- if (address == NULL) {
- isc_log_iwrite(isc_lctx, category, module, level,
- msgcat, msgset, message,
- "socket %p: %s", sock, msgbuf);
- } else {
- isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
- isc_log_iwrite(isc_lctx, category, module, level,
- msgcat, msgset, message,
- "socket %p %s: %s", sock, peerbuf, msgbuf);
- }
- }
#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 *
 * Re-enable IPV6_RECVPKTINFO on 'sock'.  Only IPv6 UDP sockets are
 * touched; a setsockopt() failure is reported via UNEXPECTED_ERROR and
 * otherwise ignored.  On all other platforms this is a no-op macro.
 */
static void
FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
{
	char errtext[ISC_STRERRORSIZE];
	int enable = 1;

	if (sock->pf == AF_INET6 && sock->type == isc_sockettype_udp &&
	    setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
		       (void *)&enable, sizeof(enable)) < 0)
	{
		isc__strerror(errno, errtext, sizeof(errtext));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVPKTINFO) "
				 "%s: %s", sock->fd,
				 isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED,
						"failed"),
				 errtext);
	}
}
#else
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif
- /*%
- * Increment socket-related statistics counters.
- */
- static inline void
- inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
- REQUIRE(counterid != -1);
- if (stats != NULL)
- isc_stats_increment(stats, counterid);
- }
- static inline isc_result_t
- watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
- isc_result_t result = ISC_R_SUCCESS;
- #ifdef USE_KQUEUE
- struct kevent evchange;
- memset(&evchange, 0, sizeof(evchange));
- if (msg == SELECT_POKE_READ)
- evchange.filter = EVFILT_READ;
- else
- evchange.filter = EVFILT_WRITE;
- evchange.flags = EV_ADD;
- evchange.ident = fd;
- if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
- result = isc__errno2result(errno);
- return (result);
- #elif defined(USE_EPOLL)
- struct epoll_event event;
- if (msg == SELECT_POKE_READ)
- event.events = EPOLLIN;
- else
- event.events = EPOLLOUT;
- memset(&event.data, 0, sizeof(event.data));
- event.data.fd = fd;
- if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
- errno != EEXIST) {
- result = isc__errno2result(errno);
- }
- return (result);
- #elif defined(USE_DEVPOLL)
- struct pollfd pfd;
- int lockid = FDLOCK_ID(fd);
- memset(&pfd, 0, sizeof(pfd));
- if (msg == SELECT_POKE_READ)
- pfd.events = POLLIN;
- else
- pfd.events = POLLOUT;
- pfd.fd = fd;
- pfd.revents = 0;
- LOCK(&manager->fdlock[lockid]);
- if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
- result = isc__errno2result(errno);
- else {
- if (msg == SELECT_POKE_READ)
- manager->fdpollinfo[fd].want_read = 1;
- else
- manager->fdpollinfo[fd].want_write = 1;
- }
- UNLOCK(&manager->fdlock[lockid]);
- return (result);
- #elif defined(USE_SELECT)
- LOCK(&manager->lock);
- if (msg == SELECT_POKE_READ)
- FD_SET(fd, manager->read_fds);
- if (msg == SELECT_POKE_WRITE)
- FD_SET(fd, manager->write_fds);
- UNLOCK(&manager->lock);
- return (result);
- #endif
- }
- static inline isc_result_t
- unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
- isc_result_t result = ISC_R_SUCCESS;
- #ifdef USE_KQUEUE
- struct kevent evchange;
- memset(&evchange, 0, sizeof(evchange));
- if (msg == SELECT_POKE_READ)
- evchange.filter = EVFILT_READ;
- else
- evchange.filter = EVFILT_WRITE;
- evchange.flags = EV_DELETE;
- evchange.ident = fd;
- if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
- result = isc__errno2result(errno);
- return (result);
- #elif defined(USE_EPOLL)
- struct epoll_event event;
- if (msg == SELECT_POKE_READ)
- event.events = EPOLLIN;
- else
- event.events = EPOLLOUT;
- memset(&event.data, 0, sizeof(event.data));
- event.data.fd = fd;
- if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
- errno != ENOENT) {
- char strbuf[ISC_STRERRORSIZE];
- isc__strerror(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- "epoll_ctl(DEL), %d: %s", fd, strbuf);
- result = ISC_R_UNEXPECTED;
- }
- return (result);
- #elif defined(USE_DEVPOLL)
- struct pollfd pfds[2];
- size_t writelen = sizeof(pfds[0]);
- int lockid = FDLOCK_ID(fd);
- memset(pfds, 0, sizeof(pfds));
- pfds[0].events = POLLREMOVE;
- pfds[0].fd = fd;
- /*
- * Canceling read or write polling via /dev/poll is tricky. Since it
- * only provides a way of canceling per FD, we may need to re-poll the
- * socket for the other operation.
- */
- LOCK(&manager->fdlock[lockid]);
- if (msg == SELECT_POKE_READ &&
- manager->fdpollinfo[fd].want_write == 1) {
- pfds[1].events = POLLOUT;
- pfds[1].fd = fd;
- writelen += sizeof(pfds[1]);
- }
- if (msg == SELECT_POKE_WRITE &&
- manager->fdpollinfo[fd].want_read == 1) {
- pfds[1].events = POLLIN;
- pfds[1].fd = fd;
- writelen += sizeof(pfds[1]);
- }
- if (write(manager->devpoll_fd, pfds, writelen) == -1)
- result = isc__errno2result(errno);
- else {
- if (msg == SELECT_POKE_READ)
- manager->fdpollinfo[fd].want_read = 0;
- else
- manager->fdpollinfo[fd].want_write = 0;
- }
- UNLOCK(&manager->fdlock[lockid]);
- return (result);
- #elif defined(USE_SELECT)
- LOCK(&manager->lock);
- if (msg == SELECT_POKE_READ)
- FD_CLR(fd, manager->read_fds);
- else if (msg == SELECT_POKE_WRITE)
- FD_CLR(fd, manager->write_fds);
- UNLOCK(&manager->lock);
- return (result);
- #endif
- }
- static void
- wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
- isc_result_t result;
- int lockid = FDLOCK_ID(fd);
- /*
- * This is a wakeup on a socket. If the socket is not in the
- * process of being closed, start watching it for either reads
- * or writes.
- */
- INSIST(fd >= 0 && fd < (int)manager->maxsocks);
- if (msg == SELECT_POKE_CLOSE) {
- /* No one should be updating fdstate, so no need to lock it */
- INSIST(manager->fdstate[fd] == CLOSE_PENDING);
- manager->fdstate[fd] = CLOSED;
- (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
- (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
- (void)close(fd);
- return;
- }
- LOCK(&manager->fdlock[lockid]);
- if (manager->fdstate[fd] == CLOSE_PENDING) {
- UNLOCK(&manager->fdlock[lockid]);
- /*
- * We accept (and ignore) any error from unwatch_fd() as we are
- * closing the socket, hoping it doesn't leave dangling state in
- * the kernel.
- * Note that unwatch_fd() must be called after releasing the
- * fdlock; otherwise it could cause deadlock due to a lock order
- * reversal.
- */
- (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
- (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
- return;
- }
- if (manager->fdstate[fd] != MANAGED) {
- UNLOCK(&manager->fdlock[lockid]);
- return;
- }
- UNLOCK(&manager->fdlock[lockid]);
- /*
- * Set requested bit.
- */
- result = watch_fd(manager, fd, msg);
- if (result != ISC_R_SUCCESS) {
- /*
- * XXXJT: what should we do? Ignoring the failure of watching
- * a socket will make the application dysfunctional, but there
- * seems to be no reasonable recovery process.
- */
- isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
- ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
- "failed to start watching FD (%d): %s",
- fd, isc_result_totext(result));
- }
- }
- #ifdef USE_WATCHER_THREAD
- /*
- * Poke the select loop when there is something for us to do.
- * The write is required (by POSIX) to complete. That is, we
- * will not get partial writes.
- */
- static void
- select_poke(isc__socketmgr_t *mgr, int fd, int msg) {
- int cc;
- int buf[2];
- char strbuf[ISC_STRERRORSIZE];
- buf[0] = fd;
- buf[1] = msg;
- do {
- cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
- #ifdef ENOSR
- /*
- * Treat ENOSR as EAGAIN but loop slowly as it is
- * unlikely to clear fast.
- */
- if (cc < 0 && errno == ENOSR) {
- sleep(1);
- errno = EAGAIN;
- }
- #endif
- } while (cc < 0 && SOFT_ERROR(errno));
- if (cc < 0) {
- isc__strerror(errno, strbuf, sizeof(strbuf));
- FATAL_ERROR(__FILE__, __LINE__,
- isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
- ISC_MSG_WRITEFAILED,
- "write() failed "
- "during watcher poke: %s"),
- strbuf);
- }
- INSIST(cc == sizeof(buf));
- }
- /*
- * Read a message on the internal fd.
- */
- static void
- select_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) {
- int buf[2];
- int cc;
- char strbuf[ISC_STRERRORSIZE];
- cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
- if (cc < 0) {
- *msg = SELECT_POKE_NOTHING;
- *fd = -1; /* Silence compiler. */
- if (SOFT_ERROR(errno))
- return;
- isc__strerror(errno, strbuf, sizeof(strbuf));
- FATAL_ERROR(__FILE__, __LINE__,
- isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
- ISC_MSG_READFAILED,
- "read() failed "
- "during watcher poke: %s"),
- strbuf);
- return;
- }
- INSIST(cc == sizeof(buf));
- *fd = buf[0];
- *msg = buf[1];
- }
- #else /* USE_WATCHER_THREAD */
- /*
- * Update the state of the socketmgr when something changes.
- */
- static void
- select_poke(isc__socketmgr_t *manager, int fd, int msg) {
- if (msg == SELECT_POKE_SHUTDOWN)
- return;
- else if (fd >= 0)
- wakeup_socket(manager, fd, msg);
- return;
- }
- #endif /* USE_WATCHER_THREAD */
- /*
- * Make a fd non-blocking.
- */
- static isc_result_t
- make_nonblock(int fd) {
- int ret;
- int flags;
- char strbuf[ISC_STRERRORSIZE];
- #ifdef USE_FIONBIO_IOCTL
- int on = 1;
- ret = ioctl(fd, FIONBIO, (char *)&on);
- #else
- flags = fcntl(fd, F_GETFL, 0);
- flags |= PORT_NONBLOCK;
- ret = fcntl(fd, F_SETFL, flags);
- #endif
- if (ret == -1) {
- isc__strerror(errno, strbuf, sizeof(strbuf));
- UNEXPECTED_ERROR(__FILE__, __LINE__,
- #ifdef USE_FIONBIO_IOCTL
- "ioctl(%d, FIONBIO, &on): %s", fd,
- #else
- "fcntl(%d, F_SETFL, %d): %s", fd, flags,
- #endif
- strbuf);
- return (ISC_R_UNEXPECTED);
- }
- return (ISC_R_SUCCESS);
- }
- #ifdef USE_CMSG
- /*
- * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
- * In order to ensure as much portability as possible, we provide wrapper
- * functions of these macros.
- * Note that cmsg_space() could run slow on OSes that do not have
- * CMSG_SPACE.
- */
- static inline ISC_SOCKADDR_LEN_T
- cmsg_len(ISC_SOCKADDR_LEN_T len) {
- #ifdef CMSG_LEN
- return (CMSG_LEN(len));
- #else
- ISC_SOCKADDR_LEN_T hdrlen;
- /*
- * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
- * is correct.
- */
- hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
- return (hdrlen + len);
- #endif
- }
- static inline ISC_SOCKADDR_LEN_T
- cmsg_space(ISC_SOCKADDR_LEN_T len) {
- #ifdef CMSG_SPACE
- return (CMSG_SPACE(len));
- #else
- struct msghdr msg;
- struct cmsghdr *cmsgp;
- /*
- * XXX: The buffer length is an ad-hoc value, but should be enough
- * in a practical sense.
- */
- char dummybuf[sizeof(struct cmsghdr) + 1024];
- memset(&msg, 0, sizeof(msg));
- msg.msg_control = dummybuf;
- msg.msg_controllen = sizeof(dummybuf);
- cmsgp = (struct cmsghdr *)dummybuf;
- cmsgp->cmsg_len = cmsg_len(len);
- cmsgp = CMSG_NXTHDR(&msg, cmsgp);
- if (cmsgp != NULL)
- return ((char *)cmsgp - (char *)msg.msg_control);
- else
- return (0);
- #endif
- }
- #endif /* USE_CMSG */
- /*
- * Process control messages received on a socket.
- */
- static void
- process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
- #ifdef USE_CMSG
- struct cmsghdr *cmsgp;
- #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
- struct in6_pktinfo *pktinfop;
- #endif
- #ifdef SO_TIMESTAMP
- struct timeval *timevalp;
- #endif
- #endif
- /*
- * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
- * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
- * They are all here, outside of the CPP tests, because it is
- * more consistent with the usual ISC coding style.
- */
- UNUSED(sock);
- UNUSED(msg);
- UNUSED(dev);
- #ifdef ISC_NET_BSD44MSGHDR
- #ifdef MSG_TRUNC
- if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
- dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
- #endif
- #ifdef MSG_CTRUNC
- if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
- dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
- #endif
- #ifndef USE_CMSG
- return;
- #else
- if (msg->msg_controllen == 0U || msg->msg_control == NULL)
- return;
- #ifdef SO_TIMESTAMP
- timevalp = NULL;
- #endif
- #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
- pktinfop = NULL;
- #endif
- cmsgp = CMSG_FIRSTHDR(msg);
- while (cmsgp != NULL) {
- socket_log(sock, NULL, TRACE,
- isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
- "processing cmsg %p", cmsgp);
- #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
- if (cmsgp->cmsg_level == IPPROTO_IPV6
- && cmsgp->cmsg_type == IPV6_PKTINFO) {
- pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
- memcpy(&dev->pktinfo, pktinfop,
- sizeof(struct in6_pktinfo));
- dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
- socket_log(sock, NULL, TRACE,
- isc_msgcat, ISC_MSGSET_SOCKET,
- ISC_MSG_IFRECEIVED,
- "interface received on ifindex %u",
- dev->pktinfo.ipi6_ifindex);
- if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
- dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
- goto next;
- }
- #endif
- #ifdef SO_TIMESTAMP
- if (cmsgp->cmsg_level == SOL_SOCKET
- && cmsgp->cmsg_type == SCM_TIMESTAMP) {
- timevalp = (struct timeval *)CMSG_DATA(cmsgp);
- dev->timestamp.seconds = timevalp->tv_sec;
- dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
- dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
- goto next;
- }
- #endif
- next:
- cmsgp = CMSG_NXTHDR(msg, cmsgp);
- }
- #endif /* USE_CMSG */
- #endif /* ISC_NET_BSD44MSGHDR */
- }
- /*
- * Construct an iov array and attach it to the msghdr passed in. This is
- * the SEND constructor, which will use the used region of the buffer
- * (if using a buffer list) or will use the internal region (if a single
- * buffer I/O is requested).
- *
- * Nothing can be NULL, and the done event must list at least one buffer
- * on the buffer linked list for this function to be meaningful.
- *
- * If write_countp != NULL, *write_countp will hold the number of bytes
- * this transaction can send.
- */
- static void
- build_msghdr_send(isc__socket_t *sock, isc_socketevent_t *dev,
- struct msghdr *msg, struct iovec *iov, size_t *write_countp)
- {
- unsigned int iovcount;
- isc_buffer_t *buffer;
- isc_region_t used;
- size_t write_count;
- size_t skip_count;
- memset(msg, 0, sizeof(*msg));
- if (!sock->connected) {
- msg->msg_name = (void *)&dev->address.type.sa;
- msg->msg_namelen = dev->address.length;
- } else {
- msg->msg_name = NULL;
- msg->msg_namelen = 0;
- }
- buffer = ISC_LIST_HEAD(dev->bufferlist);
- write_count = 0;
- iovcount = 0;
- /*
- * Single buffer I/O? Skip what we've done so far in this region.
- */
- if (buffer == NULL) {
- write_count = dev->region.length - dev->n;
- iov[0].iov_base = (void *)(dev->region.base + dev->n);
- iov[0].iov_len = write_count;
- iovcount = 1;
- goto config;
- }
- /*
- * Multibuffer I/O.
- * Skip the data in the buffer list that we have already written.
- */
- skip_count = dev->n;
- while (buffer != NULL) {
- REQUIRE(ISC_BUFFER_VALID(buffer));
- if (skip_count < isc_buffer_usedlength(buffer))
- break;
- skip_count -= isc_buffer_usedlength(buffer);
- buffer = ISC_LIST_NEXT(buffer, link);
- }
- while (buffer != NULL) {
- INSIST(iovcount < MAXSCATTERGATHER_SEND);
- isc_buffer_usedregion(buffer, &used);
- if (used.length > 0) {
- iov[iovcount].iov_base = (void *)(used.base
- + skip_count);
- iov[iovcount].iov_len = used.length - skip_count;
- write_count += (used.length - skip_count);
- skip_count = 0;
- iovcount++;
- }
- buffer = ISC_LIST_NEXT(buffer, link);
- }
- INSIST(skip_count == 0U);
- config:
- msg->msg_iov = iov;
- msg->msg_iovlen = iovcount;
- #ifdef ISC_NET_BSD44MSGHDR
- msg->msg_control = NULL;
- msg->msg_controllen = 0;
- msg->msg_flags = 0;
- #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
- if ((sock->type == isc_sockettype_udp)
- && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
- #if defined(IPV6_USE_MIN_MTU)
- int use_min_mtu = 1; /* -1, 0, 1 */
- #endif
- struct cmsghdr *cmsgp;
- struct in6_pktinfo *pktinfop;
- socket_log(sock, NULL, TRACE,
- isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
- "sendto pktinfo data, ifindex %u",
- dev->pktinfo.ipi6_ifindex);
- msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
- INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
- msg->msg_control = (void *)sock->sendcmsgbuf;
- cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
- cmsgp->cmsg_level = IPPROTO_IPV6;
- cmsgp->cmsg_type = IPV6_PKTINFO;
- cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
- pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
- memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
- #if defined(IPV6_USE_MIN_MTU)
- /*
- * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD
- * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO
- * is used.
- */
- cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
- msg->msg_controllen);
- msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
- INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
- cmsgp->cmsg_level = IPPROTO_IPV6;
- cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
- cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
- memcpy(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
- #endif
- }
- #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
- #else /* ISC_NET_BSD44MSGHDR */
- msg->msg_accrights = NULL;
- msg->msg_accrightslen = 0;
- #endif /* ISC_NET_BSD44MSGHDR */
- if (write_countp != NULL)
- *write_countp = write_count;
- }
- /*
- * Construct an iov array and attach it to the msghdr passed in. This is
- * the RECV constructor, which will use the available region of the buffer
- * (if using a buffer list) or will use the internal region (if a single
- * buffer I/O is requested).
- *
- * Nothing can be NULL, and the done event must list at least one buffer
- * on the buffer linked list for this function to be meaningful.
- *
- * If read_countp != NULL, *read_countp will hold the number of bytes
- * this transaction can receive.
- */
- static void
- build_msghdr_recv(isc__socket_t *sock, isc_socketevent_t *dev,
- struct msghdr *msg, struct iovec *iov, size_t *read_countp)
- {
- unsigned int iovcount;
- isc_buffer_t *buffer;
- isc_region_t available;
- size_t read_count;
- memset(msg, 0, sizeof(struct msghdr));
- if (sock->type == isc_sockettype_udp) {
- memset(&dev->address, 0, sizeof(dev->address));
- #ifdef BROKEN_RECVMSG
- if (sock->pf == AF_INET) {
- msg->msg_name = (void *)&dev->address.type.sin;
- msg->msg_namelen = sizeof(dev->address.type.sin6);
- } else if (sock->pf == AF_INET6) {
- msg->msg_name = (void *)&dev->address.type.sin6;
- msg->msg_namelen = sizeof(dev->address.type.sin6);
- #ifdef ISC_PLATFORM_HAVESYSUNH
- } else if (sock->pf == AF_UNIX) {
- msg->msg_name = (void *)&dev->address.type.sunix;
- msg->msg_namelen = sizeof(dev->address.type.sunix);
- #endif
- } else {
- msg->msg_name = (void *)&dev->address.type.sa;
- msg->msg_namelen = sizeof(dev->address.type);
- }
- #else
- msg->msg_name = (void *)&dev->address.type.sa;
- msg->msg_namelen = sizeof(dev->address.type);
- #endif
- #ifdef ISC_NET_RECVOVERFLOW
- /* If needed, steal one iovec for overflow detection. */
- maxiov--;
- #endif
- } else { /* TCP */
- msg->msg_name = NULL;
- msg->msg_namelen = 0;
- dev->address = sock->peer_address;
- }
- buffer = ISC_LIST_HEAD(dev->bufferlist);
- read_count = 0;
- /*
- * Single buffer I/O? Skip what we've done so far in this region.
- */
- if (buffer == NULL) {
- read_count = dev->region.length - dev->n;
- iov[0].iov_base = (void *)(dev->region.base + dev->n);
- iov[0].iov_len = read_count;
- iovcount = 1;
- goto config;
- }
- /*
- * Multibuffer I/O.
- * Skip empty buffers.
- */
- while (buffer != NULL) {
- REQUIRE(ISC_BUFFER_VALID(buffer));
- if (isc_buffer_availablelength(buffer) != 0)
- break;
- buffer = ISC_LIST_NEXT(buffer, link);
- }
- iovcount = 0;
- while (buffer != NULL) {
- INSIST(iovcount < MAXSCATTERGATHER_RECV);
- isc_buffer_availableregion(buffer, &available);
- if (available.length > 0) {
- iov[iovcount].iov_base = (void *)(available.base);
- iov[iovcount].iov_len = available.length;
- read_count += available.length;
- iovcount++;
- }
- buffer = ISC_LIST_NEXT(buffer, link);
- }
- config:
- /*
- * If needed, set up to receive that one extra byte. Note that
- * we know there is at least one iov left, since we stole it
- * at the top of this function.
- */
- #ifdef ISC_NET_RECVOVERFLOW
- if (sock->type == isc_sockettype_udp) {
- iov[iovcount].iov_base = (void *)(&sock->overflow);
- iov[iovcount].iov_len = 1;
- iovcount++;
- }
- #endif
- msg->msg_iov = iov;
- msg->msg_iovlen = iovcount;
- #ifdef ISC_NET_BSD44MSGHDR
- msg->msg_control = NULL;
- msg->msg_controllen = 0;
- msg->msg_flags = 0;
- #if defined(USE_CMSG)
- if (sock->type == isc_sockettype_udp) {
- msg->msg_control = sock->recvcmsgbuf;
- msg->msg_controllen = sock->recvcmsgbuflen;
- }
- #endif /* USE_CMSG */
- #else /* ISC_NET_BSD44MSGHDR */
- msg->msg_accrights = NULL;
- msg->msg_accrightslen = 0;
- #endif /* ISC_NET_BSD44MSGHDR */
- if (read_countp != NULL)
- *read_countp = read_count;
- }
- static void
- set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
- isc_socketevent_t *dev)
- {
- if (sock->type == isc_sockettype_udp) {
- if (address != NULL)
- dev->address = *address;
- else
- dev->address = sock->peer_address;
- } else if (sock->type == isc_sockettype_tcp) {
- INSIST(address == NULL);
- dev->address = sock->peer_address;
- }
- }
- static void
- destroy_socketevent(isc_event_t *event) {
- isc_socketevent_t *ev = (isc_socketevent_t *)event;
- INSIST(ISC_LIST_EMPTY(ev->bufferlist));
- (ev->destroy)(event);
- }
- static isc_socketevent_t *
- allocate_socketevent(isc__socket_t *sock, isc_eventtype_t eventtype,
- isc_taskaction_t action, const void *arg)
- {
- isc_socketevent_t *ev;
- ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
- sock, eventtype,
- action, arg,
- sizeof(*ev));
- if (ev == NULL)
- return (NULL);
- ev->result = ISC_R_UNSET;
- ISC_LINK_INIT(ev, ev_link);
- ISC_LIST_INIT(ev->bufferlist);
- ev->region.base = NULL;
- ev->n = 0;
- ev->offset = 0;
- ev->attributes = 0;
- ev->destroy = ev->ev_destroy;
- ev->ev_destroy = destroy_socketevent;
- return (ev);
- }
- #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the name, iov array and (when ISC_NET_BSD44MSGHDR
 * is defined) control fields of 'msg' to stdout.  'msg' is not modified.
 *
 * Pointers are cast to (void *) because that is the only argument type
 * %p is defined for, and the unsigned loop index is printed with %u.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %ld\n", (void *)msg->msg_name,
	       (long) msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
	       (long) msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%u\tbase %p, len %ld\n", i,
		       (void *)msg->msg_iov[i].iov_base,
		       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %ld\n", (void *)msg->msg_control,
	       (long) msg->msg_controllen);
#endif
}
- #endif
- #define DOIO_SUCCESS 0 /* i/o ok, event sent */
- #define DOIO_SOFT 1 /* i/o ok, soft error, no event sent */
- #define DOIO_HARD 2 /* i/o error, event sent */
- #define DOIO_EOF 3 /* EOF, no event sent */
- static int
- doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
- int cc;
- struct iovec iov[MAXSCATTERGATHER_RECV];
- size_t read_count;
- size_t actual_count;
- struct msghdr msghdr;
- isc_buffer_t *buffer;
- int recv_errno;
- char strbuf[ISC_STRERRORSIZE];
- build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
- #if defined(ISC_SOCKET_DEBUG)
- dump_msg(&msghdr);
- #endif
- cc = recvmsg(sock->fd, &msghdr, 0);
- recv_errno = errno;
- #if defined(ISC_SOCKET_DEBUG)
- dump_msg(&msghdr);
- #endif
- if (cc < 0) {
- if (SOFT_ERROR(recv_errno))
- return (DOIO_SOFT);
- if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
- isc__strerror(recv_errno, strbuf, sizeof(strbuf));
- socket_log(sock, NULL, IOEVENT,
- isc_msgcat, ISC_MSGSET_SOCKET,
- ISC_MSG_DOIORECV,
- "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
- sock->fd, cc, recv_errno, strbuf);
- }
- #define SOFT_OR_HARD(_system, _isc) \
- if (recv_errno == _system) { \
- if (sock->connected) { \
- dev->result = _isc; \
- inc_stats(sock->manager->stats, \
- sock->statsindex[STATID_RECVFAIL]); \
- return (DOIO_HARD); \
- } \
- return (DOIO_SOFT); \
- }
- #define ALWAYS_HARD(_system, _isc) \
- if (recv_errno == _system) { \
- dev->result = _isc; \
- inc_stats(sock->manager->stats, \
- sock->statsindex[STATID_RECVFAIL]); \
- return (DOIO_HARD); \
- }
- SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
- SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
- SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
- SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
- /* HPUX 11.11 can return EADDRNOTAVAIL. */
- SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
- ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
- /*
- * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
- * errors.
- */
- #ifdef EPROTO
- SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
- #endif
- SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
- #undef SOFT_OR_HARD
- #undef ALWAYS_HARD
- dev->result = isc__errno2result(recv_errno);
- inc_stats(sock->manager->stats,
- sock->statsindex[STATID_RECVFAIL]);
- return (DOIO_HARD);
- }
- /*
- * On TCP and UNIX sockets, zero length reads indicate EOF,
- * while on UDP sockets, zero length reads are perfectly valid,
- * although strange.
- */
- switch (sock->type) {
- case isc_sockettype_tcp:
- case isc_sockettype_unix:
- if (cc == 0)
- return (DOIO_EOF);
- break;
- case isc_sockettype_udp:
- break;
- case isc_sockettype_fdwatch:
- default:
- INSIST(0);
- }
- if (sock->type == isc_sockettype_udp) {
- dev->address.length = msghdr.msg_namelen;
- if (isc_sockaddr_getport(&dev->address) == 0) {
- if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
- socket_log(sock, &dev->address, IOEVENT,
- isc_msgcat, ISC_MSGSET_SOCKET,
- ISC_MSG_ZEROPORT,
- "dropping source port zero packet");
- }
- return (DOIO_SOFT);
- }
- /*
- * Simulate a firewall blocking UDP responses bigger than
- * 512 bytes.
- */
- if (sock->manager->maxudp != 0 && cc > sock->manager->maxudp)
- return (DOIO_SOFT);
- }
- socket_log(sock, &dev->address, IOEVENT,
- isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
- "packet received correctly");
- /*
- * Overflow bit detection. If we received MORE bytes than we should,
- * this indicates an overflow situation. Set the flag in the
- * dev entry and adjust how much we read by one.
- */
- #ifdef ISC_NET_RECVOVERFLOW
- if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
- dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
- cc--;
- }
- #endif
- /*
- * If there are control messages attached, run through them and pull
- * out the interesting bits.
- */
- if (sock->type == isc_sockettype_udp)
- process_cmsg(sock, &msghdr, dev);
- /*
- * update the buffers (if any) and the i/o count
- */
- dev->n += cc;
- actual_count = cc;
- buffer = ISC_LIST_HEAD(dev->bufferlist);
- while (buffer != NULL && actual_count > 0U) {
- REQUIRE(ISC_BUFFER_VALID(buffer));
- if (isc_buffer_availablelength(buffer) <= actual_count) {
- actual_count -= isc_buffer_availablelength(buffer);
- isc_buffer_add(buffer,
- isc_buffer_availablelength(buffer));
- } else {
- isc_buffer_add(buffer, actual_count);
- actual_count = 0;
- POST(actual_count);
- break;
- }
- buffer = ISC_LIST_NEXT(buffer, link);
- if (buffer == NULL) {
- INSIST(actual_count == 0U);
- }
- }
- /*
- * If we read less than we expected, update counters,
- * and let the upper layer poke the descriptor.
- */
- if (((size_t)cc != read_count) && (dev->n < dev->minimum))
- return (DOIO_SOFT);
- /*
- * Full reads are posted, or partials if partials are ok.
- */
- dev->result = ISC_R_SUCCESS;
- return (DOIO_SUCCESS);
- }
- /*
- * Returns:
- * DOIO_SUCCESS The operation succeeded. dev->result contai…