PageRenderTime 174ms CodeModel.GetById 67ms app.highlight 88ms RepoModel.GetById 1ms app.codeStats 1ms

/contrib/bind9/lib/isc/unix/socket.c

https://bitbucket.org/freebsd/freebsd-head/
C | 5944 lines | 4372 code | 781 blank | 791 comment | 886 complexity | 2bc52ee0f9ebc7e39a20f5b2ce900a03 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/*
   2 * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
   3 * Copyright (C) 1998-2003  Internet Software Consortium.
   4 *
   5 * Permission to use, copy, modify, and/or distribute this software for any
   6 * purpose with or without fee is hereby granted, provided that the above
   7 * copyright notice and this permission notice appear in all copies.
   8 *
   9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
  10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
  11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
  12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
  13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  15 * PERFORMANCE OF THIS SOFTWARE.
  16 */
  17
  18/* $Id$ */
  19
  20/*! \file */
  21
  22#include <config.h>
  23
  24#include <sys/param.h>
  25#include <sys/types.h>
  26#include <sys/socket.h>
  27#include <sys/stat.h>
  28#include <sys/time.h>
  29#include <sys/uio.h>
  30
  31#include <errno.h>
  32#include <fcntl.h>
  33#include <stddef.h>
  34#include <stdlib.h>
  35#include <string.h>
  36#include <unistd.h>
  37
  38#include <isc/buffer.h>
  39#include <isc/bufferlist.h>
  40#include <isc/condition.h>
  41#include <isc/formatcheck.h>
  42#include <isc/list.h>
  43#include <isc/log.h>
  44#include <isc/mem.h>
  45#include <isc/msgs.h>
  46#include <isc/mutex.h>
  47#include <isc/net.h>
  48#include <isc/once.h>
  49#include <isc/platform.h>
  50#include <isc/print.h>
  51#include <isc/region.h>
  52#include <isc/socket.h>
  53#include <isc/stats.h>
  54#include <isc/strerror.h>
  55#include <isc/task.h>
  56#include <isc/thread.h>
  57#include <isc/util.h>
  58#include <isc/xml.h>
  59
  60#ifdef ISC_PLATFORM_HAVESYSUNH
  61#include <sys/un.h>
  62#endif
  63#ifdef ISC_PLATFORM_HAVEKQUEUE
  64#include <sys/event.h>
  65#endif
  66#ifdef ISC_PLATFORM_HAVEEPOLL
  67#include <sys/epoll.h>
  68#endif
  69#ifdef ISC_PLATFORM_HAVEDEVPOLL
  70#if defined(HAVE_SYS_DEVPOLL_H)
  71#include <sys/devpoll.h>
  72#elif defined(HAVE_DEVPOLL_H)
  73#include <devpoll.h>
  74#endif
  75#endif
  76
  77#include "errno2result.h"
  78
  79/* See task.c about the following definition: */
  80#ifdef BIND9
  81#ifdef ISC_PLATFORM_USETHREADS
  82#define USE_WATCHER_THREAD
  83#else
  84#define USE_SHARED_MANAGER
  85#endif	/* ISC_PLATFORM_USETHREADS */
  86#endif	/* BIND9 */
  87
  88#ifndef USE_WATCHER_THREAD
  89#include "socket_p.h"
  90#include "../task_p.h"
  91#endif /* USE_WATCHER_THREAD */
  92
  93#if defined(SO_BSDCOMPAT) && defined(__linux__)
  94#include <sys/utsname.h>
  95#endif
  96
  97/*%
  98 * Choose the most preferable multiplex method.
  99 */
 100#ifdef ISC_PLATFORM_HAVEKQUEUE
 101#define USE_KQUEUE
 102#elif defined (ISC_PLATFORM_HAVEEPOLL)
 103#define USE_EPOLL
 104#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
 105#define USE_DEVPOLL
/*
 * Per-descriptor bookkeeping for the /dev/poll backend: one bit for each
 * direction (read / write) that is currently registered for polling.
 */
typedef struct {
	unsigned int want_read : 1,
		want_write : 1;
} pollinfo_t;
 110#else
 111#define USE_SELECT
 112#endif	/* ISC_PLATFORM_HAVEKQUEUE */
 113
 114#ifndef USE_WATCHER_THREAD
 115#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
/*
 * isc_socketwait for the kqueue/epoll//dev/poll backends; only compiled
 * when USE_WATCHER_THREAD is not defined.
 * NOTE(review): nevents is presumably the number of ready events reported
 * by the last wait -- confirm against the code that fills it in.
 */
struct isc_socketwait {
	int nevents;
};
 119#elif defined (USE_SELECT)
/*
 * isc_socketwait for the select() backend; only compiled when
 * USE_WATCHER_THREAD is not defined.
 * NOTE(review): readset/writeset presumably point at the fd sets handed
 * to select(), with nfds/maxfd describing them -- confirm against the
 * code that fills this structure in.
 */
struct isc_socketwait {
	fd_set *readset;
	fd_set *writeset;
	int nfds;
	int maxfd;
};
 126#endif	/* USE_KQUEUE */
 127#endif /* !USE_WATCHER_THREAD */
 128
 129/*%
 130 * Maximum number of allowable open sockets.  This is also the maximum
 131 * allowable socket file descriptor.
 132 *
 133 * Care should be taken before modifying this value for select():
  134 * The API standard doesn't ensure select() accepts more than (the system default
 135 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
 136 * the vast majority of cases.  This constant should therefore be increased only
 137 * when absolutely necessary and possible, i.e., the server is exhausting all
 138 * available file descriptors (up to FD_SETSIZE) and the select() function
 139 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
  140 * always be true, but we keep using some of them to ensure as much
 141 * portability as possible).  Note also that overall server performance
 142 * may be rather worsened with a larger value of this constant due to
 143 * inherent scalability problems of select().
 144 *
 145 * As a special note, this value shouldn't have to be touched if
 146 * this is a build for an authoritative only DNS server.
 147 */
 148#ifndef ISC_SOCKET_MAXSOCKETS
 149#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
 150#define ISC_SOCKET_MAXSOCKETS 4096
 151#elif defined(USE_SELECT)
 152#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
 153#endif	/* USE_KQUEUE... */
 154#endif	/* ISC_SOCKET_MAXSOCKETS */
 155
 156#ifdef USE_SELECT
 157/*%
 158 * Mac OS X needs a special definition to support larger values in select().
 159 * We always define this because a larger value can be specified run-time.
 160 */
 161#ifdef __APPLE__
 162#define _DARWIN_UNLIMITED_SELECT
 163#endif	/* __APPLE__ */
 164#endif	/* USE_SELECT */
 165
 166#ifdef ISC_SOCKET_USE_POLLWATCH
 167/*%
 168 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
 169 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
 170 * some of the specified FD.  The idea is based on the observation that it's
 171 * likely for a busy server to keep receiving packets.  It specifically works
 172 * as follows: the socket watcher is first initialized with the state of
 173 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
 174 * event occurs.  When it wakes up for a socket I/O event, it moves to the
 175 * poll_active state, and sets the poll timeout to a short period
 176 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
 177 * watcher goes to the poll_checking state with the same timeout period.
 178 * In this state, the watcher tries to detect whether this is a break
 179 * during intermittent events or the kernel bug is triggered.  If the next
 180 * polling reports an event within the short period, the previous timeout is
 181 * likely to be a kernel bug, and so the watcher goes back to the active state.
 182 * Otherwise, it moves to the idle state again.
 183 *
 184 * It's not clear whether this is a thread-related bug, but since we've only
 185 * seen this with threads, this workaround is used only when enabling threads.
 186 */
 187
/* Watcher states used by the ISC_SOCKET_USE_POLLWATCH workaround. */
typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
 189
 190#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
 191#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
 192#endif	/* ISC_SOCKET_POLLWATCH_TIMEOUT */
 193#endif	/* ISC_SOCKET_USE_POLLWATCH */
 194
 195/*%
 196 * Size of per-FD lock buckets.
 197 */
 198#ifdef ISC_PLATFORM_USETHREADS
 199#define FDLOCK_COUNT		1024
 200#define FDLOCK_ID(fd)		((fd) % FDLOCK_COUNT)
 201#else
 202#define FDLOCK_COUNT		1
 203#define FDLOCK_ID(fd)		0
 204#endif	/* ISC_PLATFORM_USETHREADS */
 205
 206/*%
 207 * Maximum number of events communicated with the kernel.  There should normally
 208 * be no need for having a large number.
 209 */
 210#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
 211#ifndef ISC_SOCKET_MAXEVENTS
 212#define ISC_SOCKET_MAXEVENTS	64
 213#endif
 214#endif
 215
 216/*%
 217 * Some systems define the socket length argument as an int, some as size_t,
 218 * some as socklen_t.  This is here so it can be easily changed if needed.
 219 */
 220#ifndef ISC_SOCKADDR_LEN_T
 221#define ISC_SOCKADDR_LEN_T unsigned int
 222#endif
 223
 224/*%
 225 * Define what the possible "soft" errors can be.  These are non-fatal returns
 226 * of various network related functions, like recv() and so on.
 227 *
 228 * For some reason, BSDI (and perhaps others) will sometimes return <0
 229 * from recv() but will have errno==0.  This is broken, but we have to
 230 * work around it here.
 231 */
 232#define SOFT_ERROR(e)	((e) == EAGAIN || \
 233			 (e) == EWOULDBLOCK || \
 234			 (e) == EINTR || \
 235			 (e) == 0)
 236
 237#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
 238
 239/*!<
 240 * DLVL(90)  --  Function entry/exit and other tracing.
 241 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 242 * DLVL(60)  --  Socket data send/receive
 243 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 244 * DLVL(20)  --  Socket creation/destruction.
 245 */
 246#define TRACE_LEVEL		90
 247#define CORRECTNESS_LEVEL	70
 248#define IOEVENT_LEVEL		60
 249#define EVENT_LEVEL		50
 250#define CREATION_LEVEL		20
 251
 252#define TRACE		DLVL(TRACE_LEVEL)
 253#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
 254#define IOEVENT		DLVL(IOEVENT_LEVEL)
 255#define EVENT		DLVL(EVENT_LEVEL)
 256#define CREATION	DLVL(CREATION_LEVEL)
 257
 258typedef isc_event_t intev_t;
 259
 260#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
 261#define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
 262
 263/*!
 264 * IPv6 control information.  If the socket is an IPv6 socket we want
 265 * to collect the destination address and interface so the client can
 266 * set them on outgoing packets.
 267 */
 268#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
 269#ifndef USE_CMSG
 270#define USE_CMSG	1
 271#endif
 272#endif
 273
 274/*%
 275 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 276 * a setsockopt() like interface to request timestamps, and if the OS
 277 * doesn't do it for us, call gettimeofday() on every UDP receive?
 278 */
 279#ifdef SO_TIMESTAMP
 280#ifndef USE_CMSG
 281#define USE_CMSG	1
 282#endif
 283#endif
 284
 285/*%
 286 * The size to raise the receive buffer to (from BIND 8).
 287 */
 288#define RCVBUFSIZE (32*1024)
 289
 290/*%
 291 * The number of times a send operation is repeated if the result is EINTR.
 292 */
 293#define NRETRIES 10
 294
 295typedef struct isc__socket isc__socket_t;
 296typedef struct isc__socketmgr isc__socketmgr_t;
 297
 298#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)
 299
/*
 * Internal representation of a socket.  Fields are grouped by the lock
 * (if any) that protects them; see the section comments below.
 */
struct isc__socket {
	/* Not locked. */
	isc_socket_t		common;
	isc__socketmgr_t	*manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;
	/* Points at one of the *statsindex[] counter tables in this file. */
	const isc_statscounter_t	*statsindex;

	/* Locked by socket lock. */
	ISC_LINK(isc__socket_t)	link;
	/* SOCK_DEAD() treats a socket with references == 0 as dead. */
	unsigned int		references;
	int			fd;
	int			pf;
	char				name[16];
	void *				tag;

	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgeable before use.
	 */
	intev_t			readable_ev;
	intev_t			writable_ev;

	isc_sockaddr_t		peer_address;  /* remote address */

	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
				listener : 1, /* listener socket */
				connected : 1,
				connecting : 1, /* connect pending */
				bound : 1; /* bound to local addr */

#ifdef ISC_NET_RECVOVERFLOW
	unsigned char		overflow; /* used for MSG_TRUNC fake */
#endif

	/* Control-message (cmsg) buffers and their lengths. */
	char			*recvcmsgbuf;
	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
	char			*sendcmsgbuf;
	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;

	/*
	 * fdwatch state.
	 * NOTE(review): presumably only meaningful for sockets created via
	 * isc__socket_fdwatchcreate() -- confirm.
	 */
	void			*fdwatcharg;
	isc_sockfdwatch_t	fdwatchcb;
	int			fdwatchflags;
	isc_task_t		*fdwatchtask;
};
 353
 354#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
 355#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
 356
/*
 * Internal representation of a socket manager.  Exactly one of the
 * USE_KQUEUE / USE_EPOLL / USE_DEVPOLL / USE_SELECT sections is compiled
 * in, depending on the multiplexing method chosen earlier in this file.
 */
struct isc__socketmgr {
	/* Not locked. */
	isc_socketmgr_t		common;
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
	/* Array of per-FD lock buckets, indexed by FDLOCK_ID(fd). */
	isc_mutex_t		*fdlock;
	isc_stats_t		*stats;
#ifdef USE_KQUEUE
	int			kqueue_fd;
	int			nevents;
	struct kevent		*events;
#endif	/* USE_KQUEUE */
#ifdef USE_EPOLL
	int			epoll_fd;
	int			nevents;
	struct epoll_event	*events;
#endif	/* USE_EPOLL */
#ifdef USE_DEVPOLL
	int			devpoll_fd;
	int			nevents;
	struct pollfd		*events;
#endif	/* USE_DEVPOLL */
#ifdef USE_SELECT
	int			fd_bufsize;
#endif	/* USE_SELECT */
	/* Upper bound on descriptors; wakeup_socket() insists fd < maxsocks. */
	unsigned int		maxsocks;
#ifdef ISC_PLATFORM_USETHREADS
	int			pipe_fds[2];
#endif

	/* Locked by fdlock. */
	isc__socket_t	       **fds;
	/* Per-fd state: CLOSED, MANAGED or CLOSE_PENDING. */
	int			*fdstate;
#ifdef USE_DEVPOLL
	/* Per-fd read/write interest flags for the /dev/poll backend. */
	pollinfo_t		*fdpollinfo;
#endif

	/* Locked by manager lock. */
	ISC_LIST(isc__socket_t)	socklist;
#ifdef USE_SELECT
	fd_set			*read_fds;
	fd_set			*read_fds_copy;
	fd_set			*write_fds;
	fd_set			*write_fds_copy;
	int			maxfd;
#endif	/* USE_SELECT */
	int			reserved;	/* unlocked */
#ifdef USE_WATCHER_THREAD
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
#else /* USE_WATCHER_THREAD */
	unsigned int		refs;
#endif /* USE_WATCHER_THREAD */
	int			maxudp;
};
 412
 413#ifdef USE_SHARED_MANAGER
 414static isc__socketmgr_t *socketmgr = NULL;
 415#endif /* USE_SHARED_MANAGER */
 416
 417#define CLOSED			0	/* this one must be zero */
 418#define MANAGED			1
 419#define CLOSE_PENDING		2
 420
 421/*
 422 * send() and recv() iovec counts
 423 */
 424#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
 425#ifdef ISC_NET_RECVOVERFLOW
 426# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
 427#else
 428# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
 429#endif
 430
 431static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
 432static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
 433static void free_socket(isc__socket_t **);
 434static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
 435				    isc__socket_t **);
 436static void destroy(isc__socket_t **);
 437static void internal_accept(isc_task_t *, isc_event_t *);
 438static void internal_connect(isc_task_t *, isc_event_t *);
 439static void internal_recv(isc_task_t *, isc_event_t *);
 440static void internal_send(isc_task_t *, isc_event_t *);
 441static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
 442static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
 443static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
 444static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
 445			      struct msghdr *, struct iovec *, size_t *);
 446static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
 447			      struct msghdr *, struct iovec *, size_t *);
 448#ifdef USE_WATCHER_THREAD
 449static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
 450#endif
 451
 452/*%
 453 * The following can be either static or public, depending on build environment.
 454 */
 455
 456#ifdef BIND9
 457#define ISC_SOCKETFUNC_SCOPE
 458#else
 459#define ISC_SOCKETFUNC_SCOPE static
 460#endif
 461
 462ISC_SOCKETFUNC_SCOPE isc_result_t
 463isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
 464		   isc_socket_t **socketp);
 465ISC_SOCKETFUNC_SCOPE void
 466isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
 467ISC_SOCKETFUNC_SCOPE void
 468isc__socket_detach(isc_socket_t **socketp);
 469ISC_SOCKETFUNC_SCOPE isc_result_t
 470isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
 471ISC_SOCKETFUNC_SCOPE isc_result_t
 472isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
 473		       unsigned int maxsocks);
 474ISC_SOCKETFUNC_SCOPE void
 475isc__socketmgr_destroy(isc_socketmgr_t **managerp);
 476ISC_SOCKETFUNC_SCOPE isc_result_t
 477isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
 478		 unsigned int minimum, isc_task_t *task,
 479		  isc_taskaction_t action, const void *arg);
 480ISC_SOCKETFUNC_SCOPE isc_result_t
 481isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
 482		 unsigned int minimum, isc_task_t *task,
 483		 isc_taskaction_t action, const void *arg);
 484ISC_SOCKETFUNC_SCOPE isc_result_t
 485isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
 486		  unsigned int minimum, isc_task_t *task,
 487		  isc_socketevent_t *event, unsigned int flags);
 488ISC_SOCKETFUNC_SCOPE isc_result_t
 489isc__socket_send(isc_socket_t *sock, isc_region_t *region,
 490		 isc_task_t *task, isc_taskaction_t action, const void *arg);
 491ISC_SOCKETFUNC_SCOPE isc_result_t
 492isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
 493		   isc_task_t *task, isc_taskaction_t action, const void *arg,
 494		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
 495ISC_SOCKETFUNC_SCOPE isc_result_t
 496isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
 497		  isc_task_t *task, isc_taskaction_t action, const void *arg);
 498ISC_SOCKETFUNC_SCOPE isc_result_t
 499isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
 500		    isc_task_t *task, isc_taskaction_t action, const void *arg,
 501		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
 502ISC_SOCKETFUNC_SCOPE isc_result_t
 503isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
 504		    isc_task_t *task,
 505		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
 506		    isc_socketevent_t *event, unsigned int flags);
 507ISC_SOCKETFUNC_SCOPE void
 508isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active);
 509ISC_SOCKETFUNC_SCOPE isc_result_t
 510isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
 511		     isc_uint32_t owner, isc_uint32_t group);
 512ISC_SOCKETFUNC_SCOPE isc_result_t
 513isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
 514		 unsigned int options);
 515ISC_SOCKETFUNC_SCOPE isc_result_t
 516isc__socket_filter(isc_socket_t *sock, const char *filter);
 517ISC_SOCKETFUNC_SCOPE isc_result_t
 518isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
 519ISC_SOCKETFUNC_SCOPE isc_result_t
 520isc__socket_accept(isc_socket_t *sock,
 521		   isc_task_t *task, isc_taskaction_t action, const void *arg);
 522ISC_SOCKETFUNC_SCOPE isc_result_t
 523isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
 524		    isc_task_t *task, isc_taskaction_t action,
 525		    const void *arg);
 526ISC_SOCKETFUNC_SCOPE isc_result_t
 527isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
 528ISC_SOCKETFUNC_SCOPE isc_result_t
 529isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
 530ISC_SOCKETFUNC_SCOPE void
 531isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
 532ISC_SOCKETFUNC_SCOPE isc_sockettype_t
 533isc__socket_gettype(isc_socket_t *sock);
 534ISC_SOCKETFUNC_SCOPE isc_boolean_t
 535isc__socket_isbound(isc_socket_t *sock);
 536ISC_SOCKETFUNC_SCOPE void
 537isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
 538#if defined(HAVE_LIBXML2) && defined(BIND9)
 539ISC_SOCKETFUNC_SCOPE void
 540isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
 541#endif
 542
 543ISC_SOCKETFUNC_SCOPE isc_result_t
 544isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
 545			  isc_sockfdwatch_t callback, void *cbarg,
 546			  isc_task_t *task, isc_socket_t **socketp);
 547ISC_SOCKETFUNC_SCOPE isc_result_t
 548isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
 549
/*
 * Method table for sockets.  The inner initializer lists the
 * isc__socket_* implementations positionally, so its order has to match
 * the member order of isc_socketmethods_t (declared outside this file).
 */
static struct {
	isc_socketmethods_t methods;

	/*%
	 * The following are defined just for avoiding unused static functions.
	 */
#ifndef BIND9
	void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter,
		*listen, *accept, *getpeername, *isbound;
#endif
} socketmethods = {
	{
		isc__socket_attach,
		isc__socket_detach,
		isc__socket_bind,
		isc__socket_sendto,
		isc__socket_connect,
		isc__socket_recv,
		isc__socket_cancel,
		isc__socket_getsockname,
		isc__socket_gettype,
		isc__socket_ipv6only,
		isc__socket_fdwatchpoke
	}
#ifndef BIND9
	,
	/*
	 * Outside BIND9 these functions are static (ISC_SOCKETFUNC_SCOPE);
	 * referencing them here keeps them from being reported as unused.
	 */
	(void *)isc__socket_recvv, (void *)isc__socket_send,
	(void *)isc__socket_sendv, (void *)isc__socket_sendto2,
	(void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
	(void *)isc__socket_filter, (void *)isc__socket_listen,
	(void *)isc__socket_accept, (void *)isc__socket_getpeername,
	(void *)isc__socket_isbound
#endif
};
 584
/*
 * Method table for socket managers.  Entries are positional, so the order
 * has to match the member order of isc_socketmgrmethods_t (declared
 * outside this file).
 */
static isc_socketmgrmethods_t socketmgrmethods = {
	isc__socketmgr_destroy,
	isc__socket_create,
	isc__socket_fdwatchcreate
};
 590
 591#define SELECT_POKE_SHUTDOWN		(-1)
 592#define SELECT_POKE_NOTHING		(-2)
 593#define SELECT_POKE_READ		(-3)
 594#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
 595#define SELECT_POKE_WRITE		(-4)
 596#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
 597#define SELECT_POKE_CLOSE		(-5)
 598
 599#define SOCK_DEAD(s)			((s)->references == 0)
 600
 601/*%
 602 * Shortcut index arrays to get access to statistics counters.
 603 */
/*
 * Positions within each of the *statsindex[] tables that follow; every
 * table has one entry per constant, in this order.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9
};
/*
 * Statistics counter ids for UDP/IPv4 sockets, laid out by STATID_*
 * position.  The -1 entries (the two accept slots) have no counter;
 * inc_stats() REQUIREs counterid != -1, so they must never be passed to it.
 * NOTE(review): "upd" in the name looks like a typo for "udp"; renaming
 * would require updating every user of this table.
 */
static const isc_statscounter_t upd4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail
};
/*
 * Statistics counter ids for UDP/IPv6 sockets, laid out by STATID_*
 * position.  The -1 entries (the two accept slots) have no counter;
 * inc_stats() REQUIREs counterid != -1, so they must never be passed to it.
 * NOTE(review): "upd" in the name looks like a typo for "udp"; renaming
 * would require updating every user of this table.
 */
static const isc_statscounter_t upd6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail
};
/*
 * Statistics counter ids for TCP/IPv4 sockets, laid out by STATID_*
 * position.  Every slot has a counter.
 */
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,
	isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,
	isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail,
	isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,
	isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,
	isc_sockstatscounter_tcp4recvfail
};
/*
 * Statistics counter ids for TCP/IPv6 sockets, laid out by STATID_*
 * position.  Every slot has a counter.
 */
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,
	isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,
	isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail,
	isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,
	isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,
	isc_sockstatscounter_tcp6recvfail
};
/*
 * Statistics counter ids for UNIX-domain sockets, laid out by STATID_*
 * position.  Every slot has a counter.
 */
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,
	isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,
	isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail,
	isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,
	isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,
	isc_sockstatscounter_unixrecvfail
};
/*
 * Statistics counter ids for fdwatch sockets, laid out by STATID_*
 * position.  The -1 entries (the open, open-fail and both accept slots)
 * have no counter; inc_stats() REQUIREs counterid != -1, so they must
 * never be passed to it.
 */
static const isc_statscounter_t fdwatchstatsindex[] = {
	-1,
	-1,
	isc_sockstatscounter_fdwatchclose,
	isc_sockstatscounter_fdwatchbindfail,
	isc_sockstatscounter_fdwatchconnectfail,
	isc_sockstatscounter_fdwatchconnect,
	-1,
	-1,
	isc_sockstatscounter_fdwatchsendfail,
	isc_sockstatscounter_fdwatchrecvfail
};
 688
 689#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
 690    defined(USE_WATCHER_THREAD)
 691static void
 692manager_log(isc__socketmgr_t *sockmgr,
 693	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
 694	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
 695static void
 696manager_log(isc__socketmgr_t *sockmgr,
 697	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
 698	    const char *fmt, ...)
 699{
 700	char msgbuf[2048];
 701	va_list ap;
 702
 703	if (! isc_log_wouldlog(isc_lctx, level))
 704		return;
 705
 706	va_start(ap, fmt);
 707	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
 708	va_end(ap);
 709
 710	isc_log_write(isc_lctx, category, module, level,
 711		      "sockmgr %p: %s", sockmgr, msgbuf);
 712}
 713#endif
 714
 715static void
 716socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
 717	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
 718	   isc_msgcat_t *msgcat, int msgset, int message,
 719	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
 720static void
 721socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
 722	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
 723	   isc_msgcat_t *msgcat, int msgset, int message,
 724	   const char *fmt, ...)
 725{
 726	char msgbuf[2048];
 727	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
 728	va_list ap;
 729
 730	if (! isc_log_wouldlog(isc_lctx, level))
 731		return;
 732
 733	va_start(ap, fmt);
 734	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
 735	va_end(ap);
 736
 737	if (address == NULL) {
 738		isc_log_iwrite(isc_lctx, category, module, level,
 739			       msgcat, msgset, message,
 740			       "socket %p: %s", sock, msgbuf);
 741	} else {
 742		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
 743		isc_log_iwrite(isc_lctx, category, module, level,
 744			       msgcat, msgset, message,
 745			       "socket %p %s: %s", sock, peerbuf, msgbuf);
 746	}
 747}
 748
 749#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
 750    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
 751/*
 752 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 753 * setting IPV6_V6ONLY.
 754 */
 755static void
 756FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
 757{
 758	char strbuf[ISC_STRERRORSIZE];
 759	int on = 1;
 760
 761	if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
 762		return;
 763
 764	if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
 765		       (void *)&on, sizeof(on)) < 0) {
 766
 767		isc__strerror(errno, strbuf, sizeof(strbuf));
 768		UNEXPECTED_ERROR(__FILE__, __LINE__,
 769				 "setsockopt(%d, IPV6_RECVPKTINFO) "
 770				 "%s: %s", sock->fd,
 771				 isc_msgcat_get(isc_msgcat,
 772						ISC_MSGSET_GENERAL,
 773						ISC_MSG_FAILED,
 774						"failed"),
 775				 strbuf);
 776	}
 777}
 778#else
 779#define FIX_IPV6_RECVPKTINFO(sock) (void)0
 780#endif
 781
 782/*%
 783 * Increment socket-related statistics counters.
 784 */
 785static inline void
 786inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
 787	REQUIRE(counterid != -1);
 788
 789	if (stats != NULL)
 790		isc_stats_increment(stats, counterid);
 791}
 792
 793static inline isc_result_t
 794watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
 795	isc_result_t result = ISC_R_SUCCESS;
 796
 797#ifdef USE_KQUEUE
 798	struct kevent evchange;
 799
 800	memset(&evchange, 0, sizeof(evchange));
 801	if (msg == SELECT_POKE_READ)
 802		evchange.filter = EVFILT_READ;
 803	else
 804		evchange.filter = EVFILT_WRITE;
 805	evchange.flags = EV_ADD;
 806	evchange.ident = fd;
 807	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
 808		result = isc__errno2result(errno);
 809
 810	return (result);
 811#elif defined(USE_EPOLL)
 812	struct epoll_event event;
 813
 814	if (msg == SELECT_POKE_READ)
 815		event.events = EPOLLIN;
 816	else
 817		event.events = EPOLLOUT;
 818	memset(&event.data, 0, sizeof(event.data));
 819	event.data.fd = fd;
 820	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
 821	    errno != EEXIST) {
 822		result = isc__errno2result(errno);
 823	}
 824
 825	return (result);
 826#elif defined(USE_DEVPOLL)
 827	struct pollfd pfd;
 828	int lockid = FDLOCK_ID(fd);
 829
 830	memset(&pfd, 0, sizeof(pfd));
 831	if (msg == SELECT_POKE_READ)
 832		pfd.events = POLLIN;
 833	else
 834		pfd.events = POLLOUT;
 835	pfd.fd = fd;
 836	pfd.revents = 0;
 837	LOCK(&manager->fdlock[lockid]);
 838	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
 839		result = isc__errno2result(errno);
 840	else {
 841		if (msg == SELECT_POKE_READ)
 842			manager->fdpollinfo[fd].want_read = 1;
 843		else
 844			manager->fdpollinfo[fd].want_write = 1;
 845	}
 846	UNLOCK(&manager->fdlock[lockid]);
 847
 848	return (result);
 849#elif defined(USE_SELECT)
 850	LOCK(&manager->lock);
 851	if (msg == SELECT_POKE_READ)
 852		FD_SET(fd, manager->read_fds);
 853	if (msg == SELECT_POKE_WRITE)
 854		FD_SET(fd, manager->write_fds);
 855	UNLOCK(&manager->lock);
 856
 857	return (result);
 858#endif
 859}
 860
 861static inline isc_result_t
 862unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
 863	isc_result_t result = ISC_R_SUCCESS;
 864
 865#ifdef USE_KQUEUE
 866	struct kevent evchange;
 867
 868	memset(&evchange, 0, sizeof(evchange));
 869	if (msg == SELECT_POKE_READ)
 870		evchange.filter = EVFILT_READ;
 871	else
 872		evchange.filter = EVFILT_WRITE;
 873	evchange.flags = EV_DELETE;
 874	evchange.ident = fd;
 875	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
 876		result = isc__errno2result(errno);
 877
 878	return (result);
 879#elif defined(USE_EPOLL)
 880	struct epoll_event event;
 881
 882	if (msg == SELECT_POKE_READ)
 883		event.events = EPOLLIN;
 884	else
 885		event.events = EPOLLOUT;
 886	memset(&event.data, 0, sizeof(event.data));
 887	event.data.fd = fd;
 888	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
 889	    errno != ENOENT) {
 890		char strbuf[ISC_STRERRORSIZE];
 891		isc__strerror(errno, strbuf, sizeof(strbuf));
 892		UNEXPECTED_ERROR(__FILE__, __LINE__,
 893				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
 894		result = ISC_R_UNEXPECTED;
 895	}
 896	return (result);
 897#elif defined(USE_DEVPOLL)
 898	struct pollfd pfds[2];
 899	size_t writelen = sizeof(pfds[0]);
 900	int lockid = FDLOCK_ID(fd);
 901
 902	memset(pfds, 0, sizeof(pfds));
 903	pfds[0].events = POLLREMOVE;
 904	pfds[0].fd = fd;
 905
 906	/*
 907	 * Canceling read or write polling via /dev/poll is tricky.  Since it
 908	 * only provides a way of canceling per FD, we may need to re-poll the
 909	 * socket for the other operation.
 910	 */
 911	LOCK(&manager->fdlock[lockid]);
 912	if (msg == SELECT_POKE_READ &&
 913	    manager->fdpollinfo[fd].want_write == 1) {
 914		pfds[1].events = POLLOUT;
 915		pfds[1].fd = fd;
 916		writelen += sizeof(pfds[1]);
 917	}
 918	if (msg == SELECT_POKE_WRITE &&
 919	    manager->fdpollinfo[fd].want_read == 1) {
 920		pfds[1].events = POLLIN;
 921		pfds[1].fd = fd;
 922		writelen += sizeof(pfds[1]);
 923	}
 924
 925	if (write(manager->devpoll_fd, pfds, writelen) == -1)
 926		result = isc__errno2result(errno);
 927	else {
 928		if (msg == SELECT_POKE_READ)
 929			manager->fdpollinfo[fd].want_read = 0;
 930		else
 931			manager->fdpollinfo[fd].want_write = 0;
 932	}
 933	UNLOCK(&manager->fdlock[lockid]);
 934
 935	return (result);
 936#elif defined(USE_SELECT)
 937	LOCK(&manager->lock);
 938	if (msg == SELECT_POKE_READ)
 939		FD_CLR(fd, manager->read_fds);
 940	else if (msg == SELECT_POKE_WRITE)
 941		FD_CLR(fd, manager->write_fds);
 942	UNLOCK(&manager->lock);
 943
 944	return (result);
 945#endif
 946}
 947
 948static void
 949wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
 950	isc_result_t result;
 951	int lockid = FDLOCK_ID(fd);
 952
 953	/*
 954	 * This is a wakeup on a socket.  If the socket is not in the
 955	 * process of being closed, start watching it for either reads
 956	 * or writes.
 957	 */
 958
 959	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
 960
 961	if (msg == SELECT_POKE_CLOSE) {
 962		/* No one should be updating fdstate, so no need to lock it */
 963		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
 964		manager->fdstate[fd] = CLOSED;
 965		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
 966		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
 967		(void)close(fd);
 968		return;
 969	}
 970
 971	LOCK(&manager->fdlock[lockid]);
 972	if (manager->fdstate[fd] == CLOSE_PENDING) {
 973		UNLOCK(&manager->fdlock[lockid]);
 974
 975		/*
 976		 * We accept (and ignore) any error from unwatch_fd() as we are
 977		 * closing the socket, hoping it doesn't leave dangling state in
 978		 * the kernel.
 979		 * Note that unwatch_fd() must be called after releasing the
 980		 * fdlock; otherwise it could cause deadlock due to a lock order
 981		 * reversal.
 982		 */
 983		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
 984		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
 985		return;
 986	}
 987	if (manager->fdstate[fd] != MANAGED) {
 988		UNLOCK(&manager->fdlock[lockid]);
 989		return;
 990	}
 991	UNLOCK(&manager->fdlock[lockid]);
 992
 993	/*
 994	 * Set requested bit.
 995	 */
 996	result = watch_fd(manager, fd, msg);
 997	if (result != ISC_R_SUCCESS) {
 998		/*
 999		 * XXXJT: what should we do?  Ignoring the failure of watching
1000		 * a socket will make the application dysfunctional, but there
1001		 * seems to be no reasonable recovery process.
1002		 */
1003		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1004			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1005			      "failed to start watching FD (%d): %s",
1006			      fd, isc_result_totext(result));
1007	}
1008}
1009
1010#ifdef USE_WATCHER_THREAD
1011/*
1012 * Poke the select loop when there is something for us to do.
1013 * The write is required (by POSIX) to complete.  That is, we
1014 * will not get partial writes.
1015 */
1016static void
1017select_poke(isc__socketmgr_t *mgr, int fd, int msg) {
1018	int cc;
1019	int buf[2];
1020	char strbuf[ISC_STRERRORSIZE];
1021
1022	buf[0] = fd;
1023	buf[1] = msg;
1024
1025	do {
1026		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
1027#ifdef ENOSR
1028		/*
1029		 * Treat ENOSR as EAGAIN but loop slowly as it is
1030		 * unlikely to clear fast.
1031		 */
1032		if (cc < 0 && errno == ENOSR) {
1033			sleep(1);
1034			errno = EAGAIN;
1035		}
1036#endif
1037	} while (cc < 0 && SOFT_ERROR(errno));
1038
1039	if (cc < 0) {
1040		isc__strerror(errno, strbuf, sizeof(strbuf));
1041		FATAL_ERROR(__FILE__, __LINE__,
1042			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
1043					   ISC_MSG_WRITEFAILED,
1044					   "write() failed "
1045					   "during watcher poke: %s"),
1046			    strbuf);
1047	}
1048
1049	INSIST(cc == sizeof(buf));
1050}
1051
1052/*
1053 * Read a message on the internal fd.
1054 */
1055static void
1056select_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) {
1057	int buf[2];
1058	int cc;
1059	char strbuf[ISC_STRERRORSIZE];
1060
1061	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
1062	if (cc < 0) {
1063		*msg = SELECT_POKE_NOTHING;
1064		*fd = -1;	/* Silence compiler. */
1065		if (SOFT_ERROR(errno))
1066			return;
1067
1068		isc__strerror(errno, strbuf, sizeof(strbuf));
1069		FATAL_ERROR(__FILE__, __LINE__,
1070			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
1071					   ISC_MSG_READFAILED,
1072					   "read() failed "
1073					   "during watcher poke: %s"),
1074			    strbuf);
1075
1076		return;
1077	}
1078	INSIST(cc == sizeof(buf));
1079
1080	*fd = buf[0];
1081	*msg = buf[1];
1082}
1083#else /* USE_WATCHER_THREAD */
1084/*
1085 * Update the state of the socketmgr when something changes.
1086 */
1087static void
1088select_poke(isc__socketmgr_t *manager, int fd, int msg) {
1089	if (msg == SELECT_POKE_SHUTDOWN)
1090		return;
1091	else if (fd >= 0)
1092		wakeup_socket(manager, fd, msg);
1093	return;
1094}
1095#endif /* USE_WATCHER_THREAD */
1096
1097/*
1098 * Make a fd non-blocking.
1099 */
1100static isc_result_t
1101make_nonblock(int fd) {
1102	int ret;
1103	int flags;
1104	char strbuf[ISC_STRERRORSIZE];
1105#ifdef USE_FIONBIO_IOCTL
1106	int on = 1;
1107
1108	ret = ioctl(fd, FIONBIO, (char *)&on);
1109#else
1110	flags = fcntl(fd, F_GETFL, 0);
1111	flags |= PORT_NONBLOCK;
1112	ret = fcntl(fd, F_SETFL, flags);
1113#endif
1114
1115	if (ret == -1) {
1116		isc__strerror(errno, strbuf, sizeof(strbuf));
1117		UNEXPECTED_ERROR(__FILE__, __LINE__,
1118#ifdef USE_FIONBIO_IOCTL
1119				 "ioctl(%d, FIONBIO, &on): %s", fd,
1120#else
1121				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
1122#endif
1123				 strbuf);
1124
1125		return (ISC_R_UNEXPECTED);
1126	}
1127
1128	return (ISC_R_SUCCESS);
1129}
1130
1131#ifdef USE_CMSG
1132/*
1133 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
1134 * In order to ensure as much portability as possible, we provide wrapper
1135 * functions of these macros.
1136 * Note that cmsg_space() could run slow on OSes that do not have
1137 * CMSG_SPACE.
1138 */
1139static inline ISC_SOCKADDR_LEN_T
1140cmsg_len(ISC_SOCKADDR_LEN_T len) {
1141#ifdef CMSG_LEN
1142	return (CMSG_LEN(len));
1143#else
1144	ISC_SOCKADDR_LEN_T hdrlen;
1145
1146	/*
1147	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
1148	 * is correct.
1149	 */
1150	hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
1151	return (hdrlen + len);
1152#endif
1153}
1154
1155static inline ISC_SOCKADDR_LEN_T
1156cmsg_space(ISC_SOCKADDR_LEN_T len) {
1157#ifdef CMSG_SPACE
1158	return (CMSG_SPACE(len));
1159#else
1160	struct msghdr msg;
1161	struct cmsghdr *cmsgp;
1162	/*
1163	 * XXX: The buffer length is an ad-hoc value, but should be enough
1164	 * in a practical sense.
1165	 */
1166	char dummybuf[sizeof(struct cmsghdr) + 1024];
1167
1168	memset(&msg, 0, sizeof(msg));
1169	msg.msg_control = dummybuf;
1170	msg.msg_controllen = sizeof(dummybuf);
1171
1172	cmsgp = (struct cmsghdr *)dummybuf;
1173	cmsgp->cmsg_len = cmsg_len(len);
1174
1175	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
1176	if (cmsgp != NULL)
1177		return ((char *)cmsgp - (char *)msg.msg_control);
1178	else
1179		return (0);
1180#endif
1181}
1182#endif /* USE_CMSG */
1183
1184/*
1185 * Process control messages received on a socket.
1186 */
1187static void
1188process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
1189#ifdef USE_CMSG
1190	struct cmsghdr *cmsgp;
1191#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1192	struct in6_pktinfo *pktinfop;
1193#endif
1194#ifdef SO_TIMESTAMP
1195	struct timeval *timevalp;
1196#endif
1197#endif
1198
1199	/*
1200	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
1201	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
1202	 * They are all here, outside of the CPP tests, because it is
1203	 * more consistent with the usual ISC coding style.
1204	 */
1205	UNUSED(sock);
1206	UNUSED(msg);
1207	UNUSED(dev);
1208
1209#ifdef ISC_NET_BSD44MSGHDR
1210
1211#ifdef MSG_TRUNC
1212	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
1213		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1214#endif
1215
1216#ifdef MSG_CTRUNC
1217	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
1218		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
1219#endif
1220
1221#ifndef USE_CMSG
1222	return;
1223#else
1224	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1225		return;
1226
1227#ifdef SO_TIMESTAMP
1228	timevalp = NULL;
1229#endif
1230#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1231	pktinfop = NULL;
1232#endif
1233
1234	cmsgp = CMSG_FIRSTHDR(msg);
1235	while (cmsgp != NULL) {
1236		socket_log(sock, NULL, TRACE,
1237			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
1238			   "processing cmsg %p", cmsgp);
1239
1240#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1241		if (cmsgp->cmsg_level == IPPROTO_IPV6
1242		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
1243
1244			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1245			memcpy(&dev->pktinfo, pktinfop,
1246			       sizeof(struct in6_pktinfo));
1247			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1248			socket_log(sock, NULL, TRACE,
1249				   isc_msgcat, ISC_MSGSET_SOCKET,
1250				   ISC_MSG_IFRECEIVED,
1251				   "interface received on ifindex %u",
1252				   dev->pktinfo.ipi6_ifindex);
1253			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
1254				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
1255			goto next;
1256		}
1257#endif
1258
1259#ifdef SO_TIMESTAMP
1260		if (cmsgp->cmsg_level == SOL_SOCKET
1261		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1262			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
1263			dev->timestamp.seconds = timevalp->tv_sec;
1264			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
1265			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
1266			goto next;
1267		}
1268#endif
1269
1270	next:
1271		cmsgp = CMSG_NXTHDR(msg, cmsgp);
1272	}
1273#endif /* USE_CMSG */
1274
1275#endif /* ISC_NET_BSD44MSGHDR */
1276}
1277
1278/*
1279 * Construct an iov array and attach it to the msghdr passed in.  This is
1280 * the SEND constructor, which will use the used region of the buffer
1281 * (if using a buffer list) or will use the internal region (if a single
1282 * buffer I/O is requested).
1283 *
1284 * Nothing can be NULL, and the done event must list at least one buffer
1285 * on the buffer linked list for this function to be meaningful.
1286 *
1287 * If write_countp != NULL, *write_countp will hold the number of bytes
1288 * this transaction can send.
1289 */
1290static void
1291build_msghdr_send(isc__socket_t *sock, isc_socketevent_t *dev,
1292		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
1293{
1294	unsigned int iovcount;
1295	isc_buffer_t *buffer;
1296	isc_region_t used;
1297	size_t write_count;
1298	size_t skip_count;
1299
1300	memset(msg, 0, sizeof(*msg));
1301
1302	if (!sock->connected) {
1303		msg->msg_name = (void *)&dev->address.type.sa;
1304		msg->msg_namelen = dev->address.length;
1305	} else {
1306		msg->msg_name = NULL;
1307		msg->msg_namelen = 0;
1308	}
1309
1310	buffer = ISC_LIST_HEAD(dev->bufferlist);
1311	write_count = 0;
1312	iovcount = 0;
1313
1314	/*
1315	 * Single buffer I/O?  Skip what we've done so far in this region.
1316	 */
1317	if (buffer == NULL) {
1318		write_count = dev->region.length - dev->n;
1319		iov[0].iov_base = (void *)(dev->region.base + dev->n);
1320		iov[0].iov_len = write_count;
1321		iovcount = 1;
1322
1323		goto config;
1324	}
1325
1326	/*
1327	 * Multibuffer I/O.
1328	 * Skip the data in the buffer list that we have already written.
1329	 */
1330	skip_count = dev->n;
1331	while (buffer != NULL) {
1332		REQUIRE(ISC_BUFFER_VALID(buffer));
1333		if (skip_count < isc_buffer_usedlength(buffer))
1334			break;
1335		skip_count -= isc_buffer_usedlength(buffer);
1336		buffer = ISC_LIST_NEXT(buffer, link);
1337	}
1338
1339	while (buffer != NULL) {
1340		INSIST(iovcount < MAXSCATTERGATHER_SEND);
1341
1342		isc_buffer_usedregion(buffer, &used);
1343
1344		if (used.length > 0) {
1345			iov[iovcount].iov_base = (void *)(used.base
1346							  + skip_count);
1347			iov[iovcount].iov_len = used.length - skip_count;
1348			write_count += (used.length - skip_count);
1349			skip_count = 0;
1350			iovcount++;
1351		}
1352		buffer = ISC_LIST_NEXT(buffer, link);
1353	}
1354
1355	INSIST(skip_count == 0U);
1356
1357 config:
1358	msg->msg_iov = iov;
1359	msg->msg_iovlen = iovcount;
1360
1361#ifdef ISC_NET_BSD44MSGHDR
1362	msg->msg_control = NULL;
1363	msg->msg_controllen = 0;
1364	msg->msg_flags = 0;
1365#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
1366	if ((sock->type == isc_sockettype_udp)
1367	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
1368#if defined(IPV6_USE_MIN_MTU)
1369		int use_min_mtu = 1;	/* -1, 0, 1 */
1370#endif
1371		struct cmsghdr *cmsgp;
1372		struct in6_pktinfo *pktinfop;
1373
1374		socket_log(sock, NULL, TRACE,
1375			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
1376			   "sendto pktinfo data, ifindex %u",
1377			   dev->pktinfo.ipi6_ifindex);
1378
1379		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
1380		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1381		msg->msg_control = (void *)sock->sendcmsgbuf;
1382
1383		cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
1384		cmsgp->cmsg_level = IPPROTO_IPV6;
1385		cmsgp->cmsg_type = IPV6_PKTINFO;
1386		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
1387		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1388		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
1389#if defined(IPV6_USE_MIN_MTU)
1390		/*
1391		 * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD
1392		 * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO
1393		 * is used.
1394		 */
1395		cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
1396					   msg->msg_controllen);
1397		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
1398		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
1399
1400		cmsgp->cmsg_level = IPPROTO_IPV6;
1401		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
1402		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
1403		memcpy(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
1404#endif
1405	}
1406#endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
1407#else /* ISC_NET_BSD44MSGHDR */
1408	msg->msg_accrights = NULL;
1409	msg->msg_accrightslen = 0;
1410#endif /* ISC_NET_BSD44MSGHDR */
1411
1412	if (write_countp != NULL)
1413		*write_countp = write_count;
1414}
1415
1416/*
1417 * Construct an iov array and attach it to the msghdr passed in.  This is
1418 * the RECV constructor, which will use the available region of the buffer
1419 * (if using a buffer list) or will use the internal region (if a single
1420 * buffer I/O is requested).
1421 *
1422 * Nothing can be NULL, and the done event must list at least one buffer
1423 * on the buffer linked list for this function to be meaningful.
1424 *
1425 * If read_countp != NULL, *read_countp will hold the number of bytes
1426 * this transaction can receive.
1427 */
1428static void
1429build_msghdr_recv(isc__socket_t *sock, isc_socketevent_t *dev,
1430		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
1431{
1432	unsigned int iovcount;
1433	isc_buffer_t *buffer;
1434	isc_region_t available;
1435	size_t read_count;
1436
1437	memset(msg, 0, sizeof(struct msghdr));
1438
1439	if (sock->type == isc_sockettype_udp) {
1440		memset(&dev->address, 0, sizeof(dev->address));
1441#ifdef BROKEN_RECVMSG
1442		if (sock->pf == AF_INET) {
1443			msg->msg_name = (void *)&dev->address.type.sin;
1444			msg->msg_namelen = sizeof(dev->address.type.sin6);
1445		} else if (sock->pf == AF_INET6) {
1446			msg->msg_name = (void *)&dev->address.type.sin6;
1447			msg->msg_namelen = sizeof(dev->address.type.sin6);
1448#ifdef ISC_PLATFORM_HAVESYSUNH
1449		} else if (sock->pf == AF_UNIX) {
1450			msg->msg_name = (void *)&dev->address.type.sunix;
1451			msg->msg_namelen = sizeof(dev->address.type.sunix);
1452#endif
1453		} else {
1454			msg->msg_name = (void *)&dev->address.type.sa;
1455			msg->msg_namelen = sizeof(dev->address.type);
1456		}
1457#else
1458		msg->msg_name = (void *)&dev->address.type.sa;
1459		msg->msg_namelen = sizeof(dev->address.type);
1460#endif
1461#ifdef ISC_NET_RECVOVERFLOW
1462		/* If needed, steal one iovec for overflow detection. */
1463		maxiov--;
1464#endif
1465	} else { /* TCP */
1466		msg->msg_name = NULL;
1467		msg->msg_namelen = 0;
1468		dev->address = sock->peer_address;
1469	}
1470
1471	buffer = ISC_LIST_HEAD(dev->bufferlist);
1472	read_count = 0;
1473
1474	/*
1475	 * Single buffer I/O?  Skip what we've done so far in this region.
1476	 */
1477	if (buffer == NULL) {
1478		read_count = dev->region.length - dev->n;
1479		iov[0].iov_base = (void *)(dev->region.base + dev->n);
1480		iov[0].iov_len = read_count;
1481		iovcount = 1;
1482
1483		goto config;
1484	}
1485
1486	/*
1487	 * Multibuffer I/O.
1488	 * Skip empty buffers.
1489	 */
1490	while (buffer != NULL) {
1491		REQUIRE(ISC_BUFFER_VALID(buffer));
1492		if (isc_buffer_availablelength(buffer) != 0)
1493			break;
1494		buffer = ISC_LIST_NEXT(buffer, link);
1495	}
1496
1497	iovcount = 0;
1498	while (buffer != NULL) {
1499		INSIST(iovcount < MAXSCATTERGATHER_RECV);
1500
1501		isc_buffer_availableregion(buffer, &available);
1502
1503		if (available.length > 0) {
1504			iov[iovcount].iov_base = (void *)(available.base);
1505			iov[iovcount].iov_len = available.length;
1506			read_count += available.length;
1507			iovcount++;
1508		}
1509		buffer = ISC_LIST_NEXT(buffer, link);
1510	}
1511
1512 config:
1513
1514	/*
1515	 * If needed, set up to receive that one extra byte.  Note that
1516	 * we know there is at least one iov left, since we stole it
1517	 * at the top of this function.
1518	 */
1519#ifdef ISC_NET_RECVOVERFLOW
1520	if (sock->type == isc_sockettype_udp) {
1521		iov[iovcount].iov_base = (void *)(&sock->overflow);
1522		iov[iovcount].iov_len = 1;
1523		iovcount++;
1524	}
1525#endif
1526
1527	msg->msg_iov = iov;
1528	msg->msg_iovlen = iovcount;
1529
1530#ifdef ISC_NET_BSD44MSGHDR
1531	msg->msg_control = NULL;
1532	msg->msg_controllen = 0;
1533	msg->msg_flags = 0;
1534#if defined(USE_CMSG)
1535	if (sock->type == isc_sockettype_udp) {
1536		msg->msg_control = sock->recvcmsgbuf;
1537		msg->msg_controllen = sock->recvcmsgbuflen;
1538	}
1539#endif /* USE_CMSG */
1540#else /* ISC_NET_BSD44MSGHDR */
1541	msg->msg_accrights = NULL;
1542	msg->msg_accrightslen = 0;
1543#endif /* ISC_NET_BSD44MSGHDR */
1544
1545	if (read_countp != NULL)
1546		*read_countp = read_count;
1547}
1548
1549static void
1550set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
1551		isc_socketevent_t *dev)
1552{
1553	if (sock->type == isc_sockettype_udp) {
1554		if (address != NULL)
1555			dev->address = *address;
1556		else
1557			dev->address = sock->peer_address;
1558	} else if (sock->type == isc_sockettype_tcp) {
1559		INSIST(address == NULL);
1560		dev->address = sock->peer_address;
1561	}
1562}
1563
1564static void
1565destroy_socketevent(isc_event_t *event) {
1566	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1567
1568	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1569
1570	(ev->destroy)(event);
1571}
1572
1573static isc_socketevent_t *
1574allocate_socketevent(isc__socket_t *sock, isc_eventtype_t eventtype,
1575		     isc_taskaction_t action, const void *arg)
1576{
1577	isc_socketevent_t *ev;
1578
1579	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1580						     sock, eventtype,
1581						     action, arg,
1582						     sizeof(*ev));
1583
1584	if (ev == NULL)
1585		return (NULL);
1586
1587	ev->result = ISC_R_UNSET;
1588	ISC_LINK_INIT(ev, ev_link);
1589	ISC_LIST_INIT(ev->bufferlist);
1590	ev->region.base = NULL;
1591	ev->n = 0;
1592	ev->offset = 0;
1593	ev->attributes = 0;
1594	ev->destroy = ev->ev_destroy;
1595	ev->ev_destroy = destroy_socketevent;
1596
1597	return (ev);
1598}
1599
1600#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the address, name, iovec array and (with
 * ISC_NET_BSD44MSGHDR) control buffer of 'msg' to stdout.
 *
 * Pointers that are not already 'void *' are cast before being passed
 * to %p, and the unsigned loop index is printed with %u, so that every
 * argument matches its conversion specifier.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long) msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
	       (long) msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%u\tbase %p, len %ld\n", i,
		       msg->msg_iov[i].iov_base,
		       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long) msg->msg_controllen);
#endif
}
1619#endif
1620
/*
 * Status codes returned by doio_recv() (and presumably by the other
 * doio_*() helpers -- confirm against the rest of the file).
 */
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */
1625
1626static int
1627doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
1628	int cc;
1629	struct iovec iov[MAXSCATTERGATHER_RECV];
1630	size_t read_count;
1631	size_t actual_count;
1632	struct msghdr msghdr;
1633	isc_buffer_t *buffer;
1634	int recv_errno;
1635	char strbuf[ISC_STRERRORSIZE];
1636
1637	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
1638
1639#if defined(ISC_SOCKET_DEBUG)
1640	dump_msg(&msghdr);
1641#endif
1642
1643	cc = recvmsg(sock->fd, &msghdr, 0);
1644	recv_errno = errno;
1645
1646#if defined(ISC_SOCKET_DEBUG)
1647	dump_msg(&msghdr);
1648#endif
1649
1650	if (cc < 0) {
1651		if (SOFT_ERROR(recv_errno))
1652			return (DOIO_SOFT);
1653
1654		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1655			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1656			socket_log(sock, NULL, IOEVENT,
1657				   isc_msgcat, ISC_MSGSET_SOCKET,
1658				   ISC_MSG_DOIORECV,
1659				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1660				   sock->fd, cc, recv_errno, strbuf);
1661		}
1662
1663#define SOFT_OR_HARD(_system, _isc) \
1664	if (recv_errno == _system) { \
1665		if (sock->connected) { \
1666			dev->result = _isc; \
1667			inc_stats(sock->manager->stats, \
1668				  sock->statsindex[STATID_RECVFAIL]); \
1669			return (DOIO_HARD); \
1670		} \
1671		return (DOIO_SOFT); \
1672	}
1673#define ALWAYS_HARD(_system, _isc) \
1674	if (recv_errno == _system) { \
1675		dev->result = _isc; \
1676		inc_stats(sock->manager->stats, \
1677			  sock->statsindex[STATID_RECVFAIL]); \
1678		return (DOIO_HARD); \
1679	}
1680
1681		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1682		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1683		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1684		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
1685		/* HPUX 11.11 can return EADDRNOTAVAIL. */
1686		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1687		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1688		/*
1689		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
1690		 * errors.
1691		 */
1692#ifdef EPROTO
1693		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1694#endif
1695		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1696
1697#undef SOFT_OR_HARD
1698#undef ALWAYS_HARD
1699
1700		dev->result = isc__errno2result(recv_errno);
1701		inc_stats(sock->manager->stats,
1702			  sock->statsindex[STATID_RECVFAIL]);
1703		return (DOIO_HARD);
1704	}
1705
1706	/*
1707	 * On TCP and UNIX sockets, zero length reads indicate EOF,
1708	 * while on UDP sockets, zero length reads are perfectly valid,
1709	 * although strange.
1710	 */
1711	switch (sock->type) {
1712	case isc_sockettype_tcp:
1713	case isc_sockettype_unix:
1714		if (cc == 0)
1715			return (DOIO_EOF);
1716		break;
1717	case isc_sockettype_udp:
1718		break;
1719	case isc_sockettype_fdwatch:
1720	default:
1721		INSIST(0);
1722	}
1723
1724	if (sock->type == isc_sockettype_udp) {
1725		dev->address.length = msghdr.msg_namelen;
1726		if (isc_sockaddr_getport(&dev->address) == 0) {
1727			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1728				socket_log(sock, &dev->address, IOEVENT,
1729					   isc_msgcat, ISC_MSGSET_SOCKET,
1730					   ISC_MSG_ZEROPORT,
1731					   "dropping source port zero packet");
1732			}
1733			return (DOIO_SOFT);
1734		}
1735		/*
1736		 * Simulate a firewall blocking UDP responses bigger than
1737		 * 512 bytes.
1738		 */
1739		if (sock->manager->maxudp != 0 && cc > sock->manager->maxudp)
1740			return (DOIO_SOFT);
1741	}
1742
1743	socket_log(sock, &dev->address, IOEVENT,
1744		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
1745		   "packet received correctly");
1746
1747	/*
1748	 * Overflow bit detection.  If we received MORE bytes than we should,
1749	 * this indicates an overflow situation.  Set the flag in the
1750	 * dev entry and adjust how much we read by one.
1751	 */
1752#ifdef ISC_NET_RECVOVERFLOW
1753	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
1754		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
1755		cc--;
1756	}
1757#endif
1758
1759	/*
1760	 * If there are control messages attached, run through them and pull
1761	 * out the interesting bits.
1762	 */
1763	if (sock->type == isc_sockettype_udp)
1764		process_cmsg(sock, &msghdr, dev);
1765
1766	/*
1767	 * update the buffers (if any) and the i/o count
1768	 */
1769	dev->n += cc;
1770	actual_count = cc;
1771	buffer = ISC_LIST_HEAD(dev->bufferlist);
1772	while (buffer != NULL && actual_count > 0U) {
1773		REQUIRE(ISC_BUFFER_VALID(buffer));
1774		if (isc_buffer_availablelength(buffer) <= actual_count) {
1775			actual_count -= isc_buffer_availablelength(buffer);
1776			isc_buffer_add(buffer,
1777				       isc_buffer_availablelength(buffer));
1778		} else {
1779			isc_buffer_add(buffer, actual_count);
1780			actual_count = 0;
1781			POST(actual_count);
1782			break;
1783		}
1784		buffer = ISC_LIST_NEXT(buffer, link);
1785		if (buffer == NULL) {
1786			INSIST(actual_count == 0U);
1787		}
1788	}
1789
1790	/*
1791	 * If we read less than we expected, update counters,
1792	 * and let the upper layer poke the descriptor.
1793	 */
1794	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1795		return (DOIO_SOFT);
1796
1797	/*
1798	 * Full reads are posted, or partials if partials are ok.
1799	 */
1800	dev->result = ISC_R_SUCCESS;
1801	return (DOIO_SUCCESS);
1802}
1803
1804/*
1805 * Returns:
1806 *	DOIO_SUCCESS	The operation succeeded.  dev->result contaiā€¦

Large files files are truncated, but you can click here to view the full file