
/src/libeio/eio.c

http://github.com/jacksonh/manos

   1/*
   2 * libeio implementation
   3 *
   4 * Copyright (c) 2007,2008,2009,2010 Marc Alexander Lehmann <libeio@schmorp.de>
   5 * All rights reserved.
   6 *
   7 * Redistribution and use in source and binary forms, with or without modifica-
   8 * tion, are permitted provided that the following conditions are met:
   9 * 
  10 *   1.  Redistributions of source code must retain the above copyright notice,
  11 *       this list of conditions and the following disclaimer.
  12 * 
  13 *   2.  Redistributions in binary form must reproduce the above copyright
  14 *       notice, this list of conditions and the following disclaimer in the
  15 *       documentation and/or other materials provided with the distribution.
  16 * 
  17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
  19 * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
  20 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
  21 * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  23 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  24 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
  25 * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
  26 * OF THE POSSIBILITY OF SUCH DAMAGE.
  27 *
  28 * Alternatively, the contents of this file may be used under the terms of
  29 * the GNU General Public License ("GPL") version 2 or any later version,
  30 * in which case the provisions of the GPL are applicable instead of
  31 * the above. If you wish to allow the use of your version of this file
  32 * only under the terms of the GPL and not to allow others to use your
  33 * version of this file under the BSD license, indicate your decision
  34 * by deleting the provisions above and replace them with the notice
  35 * and other provisions required by the GPL. If you do not delete the
  36 * provisions above, a recipient may use your version of this file under
  37 * either the BSD or the GPL.
  38 */
  39
  40#include "eio.h"
  41
  42#ifdef EIO_STACKSIZE
  43# define XTHREAD_STACKSIZE EIO_STACKSIZE
  44#endif
  45#include "xthread.h"
  46
  47#include <errno.h>
  48#include <stddef.h>
  49#include <stdlib.h>
  50#include <string.h>
  51#include <errno.h>
  52#include <sys/types.h>
  53#include <sys/stat.h>
  54#include <sys/statvfs.h>
  55#include <limits.h>
  56#include <fcntl.h>
  57#include <assert.h>
  58
  59#ifndef EIO_FINISH
  60# define EIO_FINISH(req)  ((req)->finish) && !EIO_CANCELLED (req) ? (req)->finish (req) : 0
  61#endif
  62
  63#ifndef EIO_DESTROY
  64# define EIO_DESTROY(req) do { if ((req)->destroy) (req)->destroy (req); } while (0)
  65#endif
  66
  67#ifndef EIO_FEED
  68# define EIO_FEED(req)    do { if ((req)->feed   ) (req)->feed    (req); } while (0)
  69#endif
  70
  71#ifdef _WIN32
  72
  73  /*doh*/
  74#else
  75
  76# include "config.h"
  77# include <sys/time.h>
  78# include <sys/select.h>
  79# include <unistd.h>
  80# include <utime.h>
  81# include <signal.h>
  82# include <dirent.h>
  83
  84#if _POSIX_MEMLOCK || _POSIX_MEMLOCK_RANGE || _POSIX_MAPPED_FILES
  85# include <sys/mman.h>
  86#endif
  87
  88/* POSIX_SOURCE is useless on bsd's, and XOPEN_SOURCE is unreliable there, too */
  89# if __FreeBSD__ || defined __NetBSD__ || defined __OpenBSD__
  90#  define _DIRENT_HAVE_D_TYPE /* sigh */
  91#  define D_INO(de) (de)->d_fileno
  92#  define D_NAMLEN(de) (de)->d_namlen
  93# elif __linux || defined d_ino || _XOPEN_SOURCE >= 600
  94#  define D_INO(de) (de)->d_ino
  95# endif
  96
  97#ifdef _D_EXACT_NAMLEN
  98# undef D_NAMLEN
  99# define D_NAMLEN(de) _D_EXACT_NAMLEN (de)
 100#endif
 101
 102# ifdef _DIRENT_HAVE_D_TYPE
 103#  define D_TYPE(de) (de)->d_type
 104# endif
 105
 106# ifndef EIO_STRUCT_DIRENT
 107#  define EIO_STRUCT_DIRENT struct dirent
 108# endif
 109
 110#endif
 111
 112#if HAVE_SENDFILE
 113# if __linux
 114#  include <sys/sendfile.h>
 115# elif __FreeBSD__ || defined __APPLE__
 116#  include <sys/socket.h>
 117#  include <sys/uio.h>
 118# elif __hpux
 119#  include <sys/socket.h>
 120# elif __solaris
 121#  include <sys/sendfile.h>
 122# else
 123#  error sendfile support requested but not available
 124# endif
 125#endif
 126
 127#ifndef D_TYPE
 128# define D_TYPE(de) 0
 129#endif
 130#ifndef D_INO
 131# define D_INO(de) 0
 132#endif
 133#ifndef D_NAMLEN
 134# define D_NAMLEN(de) strlen ((de)->d_name)
 135#endif
 136
 137/* number of seconds after which idle threads exit */
 138#define IDLE_TIMEOUT 10
 139
 140/* used for struct dirent, AIX doesn't provide it */
 141#ifndef NAME_MAX
 142# define NAME_MAX 4096
 143#endif
 144
 145/* used for readlink etc. */
 146#ifndef PATH_MAX
 147# define PATH_MAX 4096
 148#endif
 149
 150/* buffer size for various temporary buffers */
 151#define EIO_BUFSIZE 65536
 152
 153#define dBUF	 				\
 154  char *eio_buf;				\
 155  ETP_WORKER_LOCK (self);			\
 156  self->dbuf = eio_buf = malloc (EIO_BUFSIZE);	\
 157  ETP_WORKER_UNLOCK (self);			\
 158  errno = ENOMEM;				\
 159  if (!eio_buf)					\
 160    return -1;
 161
 162#define EIO_TICKS ((1000000 + 1023) >> 10)
 163
 164/*****************************************************************************/
 165
 166#if __GNUC__ >= 3
 167# define expect(expr,value) __builtin_expect ((expr),(value))
 168#else
 169# define expect(expr,value) (expr)
 170#endif
 171
 172#define expect_false(expr) expect ((expr) != 0, 0)
 173#define expect_true(expr)  expect ((expr) != 0, 1)
 174
 175/*****************************************************************************/
 176
 177#define ETP_PRI_MIN EIO_PRI_MIN
 178#define ETP_PRI_MAX EIO_PRI_MAX
 179
 180struct etp_worker;
 181
 182#define ETP_REQ eio_req
 183#define ETP_DESTROY(req) eio_destroy (req)
 184static int eio_finish (eio_req *req);
 185#define ETP_FINISH(req)  eio_finish (req)
 186static void eio_execute (struct etp_worker *self, eio_req *req);
 187#define ETP_EXECUTE(wrk,req) eio_execute (wrk,req)
 188
 189#define ETP_WORKER_CLEAR(req)	\
 190  if (wrk->dbuf)		\
 191    {				\
 192      free (wrk->dbuf);		\
 193      wrk->dbuf = 0;		\
 194    }				\
 195				\
 196  if (wrk->dirp)		\
 197    {				\
 198      closedir (wrk->dirp);	\
 199      wrk->dirp = 0;		\
 200    }
 201
 202#define ETP_WORKER_COMMON \
 203  void *dbuf;	\
 204  DIR *dirp;
 205
 206/*****************************************************************************/
 207
 208#define ETP_NUM_PRI (ETP_PRI_MAX - ETP_PRI_MIN + 1)
 209
 210/* calculate time difference in ~1/EIO_TICKS of a second */
 211static int tvdiff (struct timeval *tv1, struct timeval *tv2)
 212{
 213  return  (tv2->tv_sec  - tv1->tv_sec ) * EIO_TICKS
 214       + ((tv2->tv_usec - tv1->tv_usec) >> 10);
 215}
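
/* note: EIO_TICKS is (1000000 + 1023) >> 10 == 977, i.e. roughly 1/1024 of a
 * second per tick, and tvdiff () reports elapsed time in those ticks.
 * A hypothetical worked example: for a difference of 1s 500000us, tvdiff ()
 * yields 1 * 977 + (500000 >> 10) == 977 + 488 == 1465 ticks - the same unit
 * etp_set_max_poll_time () uses below (nseconds * EIO_TICKS). */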
 216
 217static unsigned int started, idle, wanted = 4;
 218
 219static void (*want_poll_cb) (void);
 220static void (*done_poll_cb) (void);
 221 
 222static unsigned int max_poll_time;     /* reslock */
 223static unsigned int max_poll_reqs;     /* reslock */
 224
 225static volatile unsigned int nreqs;    /* reqlock */
 226static volatile unsigned int nready;   /* reqlock */
 227static volatile unsigned int npending; /* reqlock */
 228static volatile unsigned int max_idle = 4;
 229
 230static xmutex_t wrklock = X_MUTEX_INIT;
 231static xmutex_t reslock = X_MUTEX_INIT;
 232static xmutex_t reqlock = X_MUTEX_INIT;
 233static xcond_t  reqwait = X_COND_INIT;
 234
 235#if !HAVE_PREADWRITE
 236/*
 237 * make our pread/pwrite emulation safe against themselves, but not against
 238 * normal read/write by using a mutex. slows down execution a lot,
 239 * but that's your problem, not mine.
 240 */
 241static xmutex_t preadwritelock = X_MUTEX_INIT;
 242#endif
 243
 244typedef struct etp_worker
 245{
 246  /* locked by wrklock */
 247  struct etp_worker *prev, *next;
 248
 249  xthread_t tid;
 250
 251  /* locked by reslock, reqlock or wrklock */
 252  ETP_REQ *req; /* currently processed request */
 253
 254  ETP_WORKER_COMMON
 255} etp_worker;
 256
 257static etp_worker wrk_first = { &wrk_first, &wrk_first, 0 }; /* NOT etp */
 258
 259#define ETP_WORKER_LOCK(wrk)   X_LOCK   (wrklock)
 260#define ETP_WORKER_UNLOCK(wrk) X_UNLOCK (wrklock)
 261
 262/* worker threads management */
 263
 264static void etp_worker_clear (etp_worker *wrk)
 265{
 266  ETP_WORKER_CLEAR (wrk);
 267}
 268
 269static void etp_worker_free (etp_worker *wrk)
 270{
 271  wrk->next->prev = wrk->prev;
 272  wrk->prev->next = wrk->next;
 273
 274  free (wrk);
 275}
 276
 277static unsigned int etp_nreqs (void)
 278{
 279  int retval;
 280  if (WORDACCESS_UNSAFE) X_LOCK   (reqlock);
 281  retval = nreqs;
 282  if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
 283  return retval;
 284}
 285
 286static unsigned int etp_nready (void)
 287{
 288  unsigned int retval;
 289
 290  if (WORDACCESS_UNSAFE) X_LOCK   (reqlock);
 291  retval = nready;
 292  if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
 293
 294  return retval;
 295}
 296
 297static unsigned int etp_npending (void)
 298{
 299  unsigned int retval;
 300
 301  if (WORDACCESS_UNSAFE) X_LOCK   (reqlock);
 302  retval = npending;
 303  if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
 304
 305  return retval;
 306}
 307
 308static unsigned int etp_nthreads (void)
 309{
 310  unsigned int retval;
 311
 312  if (WORDACCESS_UNSAFE) X_LOCK   (reqlock);
 313  retval = started;
 314  if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
 315
 316  return retval;
 317}
 318
 319/*
 320 * a somewhat faster data structure might be nice, but
 321 * with 8 priorities this actually needs <20 insns
 322 * per shift, the most expensive operation.
 323 */
 324typedef struct {
 325  ETP_REQ *qs[ETP_NUM_PRI], *qe[ETP_NUM_PRI]; /* qstart, qend */
 326  int size;
 327} etp_reqq;
 328
 329static etp_reqq req_queue;
 330static etp_reqq res_queue;
 331
 332static int reqq_push (etp_reqq *q, ETP_REQ *req)
 333{
 334  int pri = req->pri;
 335  req->next = 0;
 336
 337  if (q->qe[pri])
 338    {
 339      q->qe[pri]->next = req;
 340      q->qe[pri] = req;
 341    }
 342  else
 343    q->qe[pri] = q->qs[pri] = req;
 344
 345  return q->size++;
 346}
 347
 348static ETP_REQ *reqq_shift (etp_reqq *q)
 349{
 350  int pri;
 351
 352  if (!q->size)
 353    return 0;
 354
 355  --q->size;
 356
 357  for (pri = ETP_NUM_PRI; pri--; )
 358    {
 359      eio_req *req = q->qs[pri];
 360
 361      if (req)
 362        {
 363          if (!(q->qs[pri] = (eio_req *)req->next))
 364            q->qe[pri] = 0;
 365
 366          return req;
 367        }
 368    }
 369
 370  abort ();
 371}
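
/* queue contract, summarised from the two functions above: reqq_push ()
 * appends to the FIFO of the request's priority and returns the previous
 * total size, so a return value of 0 tells the caller the queue just went
 * from empty to non-empty (this is what triggers want_poll_cb elsewhere);
 * reqq_shift () scans priorities from highest to lowest and pops the oldest
 * request of the best non-empty priority. */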
 372
 373static void etp_atfork_prepare (void)
 374{
 375  X_LOCK (wrklock);
 376  X_LOCK (reqlock);
 377  X_LOCK (reslock);
 378#if !HAVE_PREADWRITE
 379  X_LOCK (preadwritelock);
 380#endif
 381}
 382
 383static void etp_atfork_parent (void)
 384{
 385#if !HAVE_PREADWRITE
 386  X_UNLOCK (preadwritelock);
 387#endif
 388  X_UNLOCK (reslock);
 389  X_UNLOCK (reqlock);
 390  X_UNLOCK (wrklock);
 391}
 392
 393static void etp_atfork_child (void)
 394{
 395  ETP_REQ *prv;
 396
 397  while ((prv = reqq_shift (&req_queue)))
 398    ETP_DESTROY (prv);
 399
 400  while ((prv = reqq_shift (&res_queue)))
 401    ETP_DESTROY (prv);
 402
 403  while (wrk_first.next != &wrk_first)
 404    {
 405      etp_worker *wrk = wrk_first.next;
 406
 407      if (wrk->req)
 408        ETP_DESTROY (wrk->req);
 409
 410      etp_worker_clear (wrk);
 411      etp_worker_free (wrk);
 412    }
 413
 414  started  = 0;
 415  idle     = 0;
 416  nreqs    = 0;
 417  nready   = 0;
 418  npending = 0;
 419
 420  etp_atfork_parent ();
 421}
 422
 423static void
 424etp_once_init (void)
 425{    
 426  X_THREAD_ATFORK (etp_atfork_prepare, etp_atfork_parent, etp_atfork_child);
 427}
 428
 429static int
 430etp_init (void (*want_poll)(void), void (*done_poll)(void))
 431{
 432  static pthread_once_t doinit = PTHREAD_ONCE_INIT;
 433
 434  pthread_once (&doinit, etp_once_init);
 435
 436  want_poll_cb = want_poll;
 437  done_poll_cb = done_poll;
 438
 439  return 0;
 440}
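
/* want_poll/done_poll contract, as implemented in this file: want_poll is
 * invoked - possibly from a worker thread - whenever the result queue goes
 * from empty to non-empty, and done_poll when it has been emptied again.
 * An embedder is expected to make want_poll wake up its event loop, which
 * then calls eio_poll (); see the sketch next to eio_init () further down. */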
 441
 442X_THREAD_PROC (etp_proc);
 443
 444static void etp_start_thread (void)
 445{
 446  etp_worker *wrk = calloc (1, sizeof (etp_worker));
 447
 448  /*TODO*/
 449  assert (("unable to allocate worker thread data", wrk));
 450
 451  X_LOCK (wrklock);
 452
 453  if (thread_create (&wrk->tid, etp_proc, (void *)wrk))
 454    {
 455      wrk->prev = &wrk_first;
 456      wrk->next = wrk_first.next;
 457      wrk_first.next->prev = wrk;
 458      wrk_first.next = wrk;
 459      ++started;
 460    }
 461  else
 462    free (wrk);
 463
 464  X_UNLOCK (wrklock);
 465}
 466
 467static void etp_maybe_start_thread (void)
 468{
 469  if (expect_true (etp_nthreads () >= wanted))
 470    return;
 471  
 472  /* todo: maybe use idle here, but might be less exact */
 473  if (expect_true (0 <= (int)etp_nthreads () + (int)etp_npending () - (int)etp_nreqs ()))
 474    return;
 475
 476  etp_start_thread ();
 477}
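
/* the heuristic above never exceeds "wanted" and otherwise starts a thread
 * only while nreqs (all unfinished requests) exceeds the running threads
 * plus npending (results merely waiting to be reaped), i.e. only when some
 * queued request cannot already be in the hands of an existing thread. */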
 478
 479static void etp_end_thread (void)
 480{
 481  eio_req *req = calloc (1, sizeof (eio_req));
 482
 483  req->type = -1;
 484  req->pri  = ETP_PRI_MAX - ETP_PRI_MIN;
 485
 486  X_LOCK (reqlock);
 487  reqq_push (&req_queue, req);
 488  X_COND_SIGNAL (reqwait);
 489  X_UNLOCK (reqlock);
 490
 491  X_LOCK (wrklock);
 492  --started;
 493  X_UNLOCK (wrklock);
 494}
 495
 496static int etp_poll (void)
 497{
 498  unsigned int maxreqs;
 499  unsigned int maxtime;
 500  struct timeval tv_start, tv_now;
 501
 502  X_LOCK (reslock);
 503  maxreqs = max_poll_reqs;
 504  maxtime = max_poll_time;
 505  X_UNLOCK (reslock);
 506
 507  if (maxtime)
 508    gettimeofday (&tv_start, 0);
 509
 510  for (;;)
 511    {
 512      ETP_REQ *req;
 513
 514      etp_maybe_start_thread ();
 515
 516      X_LOCK (reslock);
 517      req = reqq_shift (&res_queue);
 518
 519      if (req)
 520        {
 521          --npending;
 522
 523          if (!res_queue.size && done_poll_cb)
 524            done_poll_cb ();
 525        }
 526
 527      X_UNLOCK (reslock);
 528
 529      if (!req)
 530        return 0;
 531
 532      X_LOCK (reqlock);
 533      --nreqs;
 534      X_UNLOCK (reqlock);
 535
 536      if (expect_false (req->type == EIO_GROUP && req->size))
 537        {
 538          req->int1 = 1; /* mark request as delayed */
 539          continue;
 540        }
 541      else
 542        {
 543          int res = ETP_FINISH (req);
 544          if (expect_false (res))
 545            return res;
 546        }
 547
 548      if (expect_false (maxreqs && !--maxreqs))
 549        break;
 550
 551      if (maxtime)
 552        {
 553          gettimeofday (&tv_now, 0);
 554
 555          if (tvdiff (&tv_start, &tv_now) >= maxtime)
 556            break;
 557        }
 558    }
 559
 560  errno = EAGAIN;
 561  return -1;
 562}
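
/* etp_poll () return values, as implemented above: 0 once the result queue
 * has been drained, -1 with errno set to EAGAIN when it stopped early
 * because max_poll_reqs or max_poll_time was reached (more results may still
 * be pending, so it should be called again later), or the first non-zero
 * value returned by a finish callback. */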
 563
 564static void etp_cancel (ETP_REQ *req)
 565{
 566  X_LOCK   (wrklock);
 567  req->flags |= EIO_FLAG_CANCELLED;
 568  X_UNLOCK (wrklock);
 569
 570  eio_grp_cancel (req);
 571}
 572
 573static void etp_submit (ETP_REQ *req)
 574{
 575  req->pri -= ETP_PRI_MIN;
 576
 577  if (expect_false (req->pri < ETP_PRI_MIN - ETP_PRI_MIN)) req->pri = ETP_PRI_MIN - ETP_PRI_MIN;
 578  if (expect_false (req->pri > ETP_PRI_MAX - ETP_PRI_MIN)) req->pri = ETP_PRI_MAX - ETP_PRI_MIN;
 579
 580  if (expect_false (req->type == EIO_GROUP))
 581    {
 582      /* I hope this is worth it :/ */
 583      X_LOCK (reqlock);
 584      ++nreqs;
 585      X_UNLOCK (reqlock);
 586
 587      X_LOCK (reslock);
 588
 589      ++npending;
 590
 591      if (!reqq_push (&res_queue, req) && want_poll_cb)
 592        want_poll_cb ();
 593
 594      X_UNLOCK (reslock);
 595    }
 596  else
 597    {
 598      X_LOCK (reqlock);
 599      ++nreqs;
 600      ++nready;
 601      reqq_push (&req_queue, req);
 602      X_COND_SIGNAL (reqwait);
 603      X_UNLOCK (reqlock);
 604
 605      etp_maybe_start_thread ();
 606    }
 607}
 608
 609static void etp_set_max_poll_time (double nseconds)
 610{
 611  if (WORDACCESS_UNSAFE) X_LOCK   (reslock);
 612  max_poll_time = nseconds * EIO_TICKS;
 613  if (WORDACCESS_UNSAFE) X_UNLOCK (reslock);
 614}
 615
 616static void etp_set_max_poll_reqs (unsigned int maxreqs)
 617{
 618  if (WORDACCESS_UNSAFE) X_LOCK   (reslock);
 619  max_poll_reqs = maxreqs;
 620  if (WORDACCESS_UNSAFE) X_UNLOCK (reslock);
 621}
 622
 623static void etp_set_max_idle (unsigned int nthreads)
 624{
 625  if (WORDACCESS_UNSAFE) X_LOCK   (reqlock);
 626  max_idle = nthreads <= 0 ? 1 : nthreads;
 627  if (WORDACCESS_UNSAFE) X_UNLOCK (reqlock);
 628}
 629
 630static void etp_set_min_parallel (unsigned int nthreads)
 631{
 632  if (wanted < nthreads)
 633    wanted = nthreads;
 634}
 635
 636static void etp_set_max_parallel (unsigned int nthreads)
 637{
 638  if (wanted > nthreads)
 639    wanted = nthreads;
 640
 641  while (started > wanted)
 642    etp_end_thread ();
 643}
 644
 645/*****************************************************************************/
 646
 647static void grp_try_feed (eio_req *grp)
 648{
 649  while (grp->size < grp->int2 && !EIO_CANCELLED (grp))
 650    {
 651      grp->flags &= ~EIO_FLAG_GROUPADD;
 652
 653      EIO_FEED (grp);
 654
 655      /* stop if no progress has been made */
 656      if (!(grp->flags & EIO_FLAG_GROUPADD))
 657        {
 658          grp->feed = 0;
 659          break;
 660        }
 661    }
 662}
 663
 664static int grp_dec (eio_req *grp)
 665{
 666  --grp->size;
 667
 668  /* call feeder, if applicable */
 669  grp_try_feed (grp);
 670
 671  /* finish, if done */
 672  if (!grp->size && grp->int1)
 673    return eio_finish (grp);
 674  else
 675    return 0;
 676}
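
/* group/feeder mechanics, summarised from the two functions above, plus a
 * minimal sketch.  eio_grp () and eio_grp_add () are assumed to come from
 * eio.h (they are not shown in this truncated listing); the fields are the
 * ones used here: grp->int2 is the "keep this many in flight" limit,
 * grp->feed is what EIO_FEED () calls while size < int2, and
 * EIO_FLAG_GROUPADD marks that the feeder made progress.
 *
 *   static void my_feeder (eio_req *grp)
 *   {
 *     // add the next sub-request; adding nothing ends the feeder
 *     eio_grp_add (grp, eio_nop (0, 0, 0));   // placeholder sub-request
 *   }
 *
 *   eio_req *grp = eio_grp (all_done_cb, 0);  // all_done_cb: placeholder
 *   grp->feed = my_feeder;
 *   grp->int2 = 4;
 */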
 677
 678void eio_destroy (eio_req *req)
 679{
 680  if ((req)->flags & EIO_FLAG_PTR1_FREE) free (req->ptr1);
 681  if ((req)->flags & EIO_FLAG_PTR2_FREE) free (req->ptr2);
 682
 683  EIO_DESTROY (req);
 684}
 685
 686static int eio_finish (eio_req *req)
 687{
 688  int res = EIO_FINISH (req);
 689
 690  if (req->grp)
 691    {
 692      int res2;
 693      eio_req *grp = req->grp;
 694
 695      /* unlink request */
 696      if (req->grp_next) req->grp_next->grp_prev = req->grp_prev;
 697      if (req->grp_prev) req->grp_prev->grp_next = req->grp_next;
 698
 699      if (grp->grp_first == req)
 700        grp->grp_first = req->grp_next;
 701
 702      res2 = grp_dec (grp);
 703
 704      if (!res && res2)
 705        res = res2;
 706    }
 707
 708  eio_destroy (req);
 709
 710  return res;
 711}
 712
 713void eio_grp_cancel (eio_req *grp)
 714{
 715  for (grp = grp->grp_first; grp; grp = grp->grp_next)
 716    eio_cancel (grp);
 717}
 718
 719void eio_cancel (eio_req *req)
 720{
 721  etp_cancel (req);
 722}
 723
 724void eio_submit (eio_req *req)
 725{
 726  etp_submit (req);
 727}
 728
 729unsigned int eio_nreqs (void)
 730{
 731  return etp_nreqs ();
 732}
 733
 734unsigned int eio_nready (void)
 735{
 736  return etp_nready ();
 737}
 738
 739unsigned int eio_npending (void)
 740{
 741  return etp_npending ();
 742}
 743
 744unsigned int eio_nthreads (void)
 745{
 746  return etp_nthreads ();
 747}
 748
 749void eio_set_max_poll_time (double nseconds)
 750{
 751  etp_set_max_poll_time (nseconds);
 752}
 753
 754void eio_set_max_poll_reqs (unsigned int maxreqs)
 755{
 756  etp_set_max_poll_reqs (maxreqs);
 757}
 758
 759void eio_set_max_idle (unsigned int nthreads)
 760{
 761  etp_set_max_idle (nthreads);
 762}
 763
 764void eio_set_min_parallel (unsigned int nthreads)
 765{
 766  etp_set_min_parallel (nthreads);
 767}
 768
 769void eio_set_max_parallel (unsigned int nthreads)
 770{
 771  etp_set_max_parallel (nthreads);
 772}
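
/* a small, hypothetical tuning example using only the setters above; the
 * numbers are illustrative, not recommendations:
 *
 *   eio_set_min_parallel (8);      // want at least 8 worker threads
 *   eio_set_max_idle (2);          // let surplus idle threads time out
 *   eio_set_max_poll_time (0.01);  // spend at most ~10ms per eio_poll ()
 *   eio_set_max_poll_reqs (64);    // finish at most 64 requests per poll
 */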
 773
 774int eio_poll (void)
 775{
 776  return etp_poll ();
 777}
 778
 779/*****************************************************************************/
 780/* work around various missing functions */
 781
 782#if !HAVE_PREADWRITE
 783# undef pread
 784# undef pwrite
 785# define pread  eio__pread
 786# define pwrite eio__pwrite
 787
 788static ssize_t
 789eio__pread (int fd, void *buf, size_t count, off_t offset)
 790{
 791  ssize_t res;
 792  off_t ooffset;
 793
 794  X_LOCK (preadwritelock);
 795  ooffset = lseek (fd, 0, SEEK_CUR);
 796  lseek (fd, offset, SEEK_SET);
 797  res = read (fd, buf, count);
 798  lseek (fd, ooffset, SEEK_SET);
 799  X_UNLOCK (preadwritelock);
 800
 801  return res;
 802}
 803
 804static ssize_t
 805eio__pwrite (int fd, void *buf, size_t count, off_t offset)
 806{
 807  ssize_t res;
 808  off_t ooffset;
 809
 810  X_LOCK (preadwritelock);
 811  ooffset = lseek (fd, 0, SEEK_CUR);
 812  lseek (fd, offset, SEEK_SET);
 813  res = write (fd, buf, count);
 814  lseek (fd, ooffset, SEEK_SET);
 815  X_UNLOCK (preadwritelock);
 816
 817  return res;
 818}
 819#endif
 820
 821#ifndef HAVE_UTIMES
 822
 823# undef utimes
 824# define utimes(path,times)  eio__utimes (path, times)
 825
 826static int
 827eio__utimes (const char *filename, const struct timeval times[2])
 828{
 829  if (times)
 830    {
 831      struct utimbuf buf;
 832
 833      buf.actime  = times[0].tv_sec;
 834      buf.modtime = times[1].tv_sec;
 835
 836      return utime (filename, &buf);
 837    }
 838  else
 839    return utime (filename, 0);
 840}
 841
 842#endif
 843
 844#ifndef HAVE_FUTIMES
 845
 846# undef futimes
 847# define futimes(fd,times) eio__futimes (fd, times)
 848
 849static int eio__futimes (int fd, const struct timeval tv[2])
 850{
 851  errno = ENOSYS;
 852  return -1;
 853}
 854
 855#endif
 856
 857#if !HAVE_FDATASYNC
 858# undef fdatasync
 859# define fdatasync(fd) fsync (fd)
 860#endif
 861
 862/* sync_file_range always needs emulation */
 863int
 864eio__sync_file_range (int fd, off_t offset, size_t nbytes, unsigned int flags)
 865{
 866#if HAVE_SYNC_FILE_RANGE
 867  int res;
 868
 869  if (EIO_SYNC_FILE_RANGE_WAIT_BEFORE   != SYNC_FILE_RANGE_WAIT_BEFORE
 870      || EIO_SYNC_FILE_RANGE_WRITE      != SYNC_FILE_RANGE_WRITE
 871      || EIO_SYNC_FILE_RANGE_WAIT_AFTER != SYNC_FILE_RANGE_WAIT_AFTER)
 872    {
 873      flags = 0
 874         | (flags & EIO_SYNC_FILE_RANGE_WAIT_BEFORE ? SYNC_FILE_RANGE_WAIT_BEFORE : 0)
 875         | (flags & EIO_SYNC_FILE_RANGE_WRITE       ? SYNC_FILE_RANGE_WRITE       : 0)
 876         | (flags & EIO_SYNC_FILE_RANGE_WAIT_AFTER  ? SYNC_FILE_RANGE_WAIT_AFTER  : 0);
 877    }
 878
 879  res = sync_file_range (fd, offset, nbytes, flags);
 880
 881  if (!res || errno != ENOSYS)
 882    return res;
 883#endif
 884
 885  /* even though we could play tricks with the flags, it's better to always
 886   * call fdatasync, as that matches the expectation of its users best */
 887  return fdatasync (fd);
 888}
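
/* the flag translation above (and the same pattern in eio__msync () and
 * eio__mlockall () below) exists because the EIO_* values are part of
 * libeio's own interface and merely happen to equal the native
 * SYNC_FILE_RANGE_*, MS_* and MCL_* constants on most systems; when they do
 * match, the constant comparison lets the remapping be optimised away. */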
 889
 890#if !HAVE_READAHEAD
 891# undef readahead
 892# define readahead(fd,offset,count) eio__readahead (fd, offset, count, self)
 893
 894static ssize_t
 895eio__readahead (int fd, off_t offset, size_t count, etp_worker *self)
 896{
 897  size_t todo = count;
 898  dBUF;
 899
 900  while (todo > 0)
 901    {
 902      size_t len = todo < EIO_BUFSIZE ? todo : EIO_BUFSIZE;
 903
 904      pread (fd, eio_buf, len, offset);
 905      offset += len;
 906      todo   -= len;
 907    }
 908
 909  errno = 0;
 910  return count;
 911}
 912
 913#endif
 914
 915/* sendfile always needs emulation */
 916static ssize_t
 917eio__sendfile (int ofd, int ifd, off_t offset, size_t count, etp_worker *self)
 918{
 919  ssize_t res;
 920
 921  if (!count)
 922    return 0;
 923
 924#if HAVE_SENDFILE
 925# if __linux
 926  res = sendfile (ofd, ifd, &offset, count);
 927
 928# elif __FreeBSD__
 929  /*
 930   * Of course, the freebsd sendfile is a dire hack with no thoughts
 931   * wasted on making it similar to other I/O functions.
 932   */
 933  {
 934    off_t sbytes;
 935    res = sendfile (ifd, ofd, offset, count, 0, &sbytes, 0);
 936
 937    #if 0 /* according to the manpage, this is correct, but broken behaviour */
 938    /* freebsd's sendfile will return 0 on success */
 939    /* freebsd 8 documents it as only setting *sbytes on EINTR and EAGAIN, but */
 940    /* not on e.g. EIO or EPIPE - sounds broken */
 941    if ((res < 0 && (errno == EAGAIN || errno == EINTR) && sbytes) || res == 0)
 942      res = sbytes;
 943    #endif
 944
 945    /* according to source inspection, this is correct, and useful behaviour */
 946    if (sbytes)
 947      res = sbytes;
 948  }
 949
 950# elif defined (__APPLE__)
 951
 952  {
 953    off_t sbytes = count;
 954    res = sendfile (ifd, ofd, offset, &sbytes, 0, 0);
 955
 956    /* according to the manpage, sbytes is always valid */
 957    if (sbytes)
 958      res = sbytes;
 959  }
 960
 961# elif __hpux
 962  res = sendfile (ofd, ifd, offset, count, 0, 0);
 963
 964# elif __solaris
 965  {
 966    struct sendfilevec vec;
 967    size_t sbytes;
 968
 969    vec.sfv_fd   = ifd;
 970    vec.sfv_flag = 0;
 971    vec.sfv_off  = offset;
 972    vec.sfv_len  = count;
 973
 974    res = sendfilev (ofd, &vec, 1, &sbytes);
 975
 976    if (res < 0 && sbytes)
 977      res = sbytes;
 978  }
 979
 980# endif
 981
 982#elif defined (_WIN32)
 983
 984  /* does not work, just for documentation of what would need to be done */
 985  {
 986    HANDLE h = TO_SOCKET (ifd);
 987    SetFilePointer (h, offset, 0, FILE_BEGIN);
 988    res = TransmitFile (TO_SOCKET (ofd), h, count, 0, 0, 0, 0);
 989  }
 990
 991#else
 992  res = -1;
 993  errno = ENOSYS;
 994#endif
 995
 996  if (res <  0
 997      && (errno == ENOSYS || errno == EINVAL || errno == ENOTSOCK
 998          /* BSDs */
 999#ifdef ENOTSUP /* sigh, if the steenking pile called openbsd would only try to at least compile posix code... */
1000          || errno == ENOTSUP
1001#endif
1002          || errno == EOPNOTSUPP /* BSDs */
1003#if __solaris
1004          || errno == EAFNOSUPPORT || errno == EPROTOTYPE
1005#endif
1006         )
1007      )
1008    {
1009      /* emulate sendfile. this is a major pain in the ass */
1010      dBUF;
1011
1012      res = 0;
1013
1014      while (count)
1015        {
1016          ssize_t cnt;
1017          
1018          cnt = pread (ifd, eio_buf, count > EIO_BUFSIZE ? EIO_BUFSIZE : count, offset);
1019
1020          if (cnt <= 0)
1021            {
1022              if (cnt && !res) res = -1;
1023              break;
1024            }
1025
1026          cnt = write (ofd, eio_buf, cnt);
1027
1028          if (cnt <= 0)
1029            {
1030              if (cnt && !res) res = -1;
1031              break;
1032            }
1033
1034          offset += cnt;
1035          res    += cnt;
1036          count  -= cnt;
1037        }
1038    }
1039
1040  return res;
1041}
1042
1043static signed char
1044eio_dent_cmp (const eio_dirent *a, const eio_dirent *b)
1045{
1046    return a->score - b->score ? a->score - b->score /* works because our signed char is always 0..100 */
1047              : a->inode < b->inode ? -1 : a->inode > b->inode ? 1 : 0;
1048}
1049
1050#define EIO_DENT_CMP(i,op,j) eio_dent_cmp (&i, &j) op 0
1051
1052#define EIO_SORT_CUTOFF 30 /* quite high, but performs well on many filesystems */
1053#define EIO_SORT_FAST   60 /* when to only use insertion sort */
1054
1055static void
1056eio_dent_radix_sort (eio_dirent *dents, int size, signed char score_bits, ino_t inode_bits)
1057{
1058  unsigned char bits [9 + sizeof (ino_t) * 8];
1059  unsigned char *bit = bits;
1060
1061  assert (CHAR_BIT == 8);
1062  assert (sizeof (eio_dirent) * 8 < 256);
1063  assert (offsetof (eio_dirent, inode)); /* we use 0 as sentinel */
1064  assert (offsetof (eio_dirent, score)); /* we use 0 as sentinel */
1065
1066  if (size <= EIO_SORT_FAST)
1067    return;
1068
1069  /* first prepare an array of bits to test in our radix sort */
1070  /* try to take endianness into account, as well as differences in ino_t sizes */
1071  /* inode_bits must contain all inodes ORed together */
1072  /* which is used to skip bits that are 0 everywhere, which is very common */
1073  {
1074    ino_t endianness;
1075    int i, j;
1076
1077    /* we store the byte offset of byte n into byte n of "endianness" */
1078    for (i = 0; i < sizeof (ino_t); ++i)
1079      ((unsigned char *)&endianness)[i] = i;
1080
1081    *bit++ = 0;
1082
1083    for (i = 0; i < sizeof (ino_t); ++i)
1084      {
1085        /* shifting off the byte offsets out of "endianness" */
1086        int offs = (offsetof (eio_dirent, inode) + (endianness & 0xff)) * 8;
1087        endianness >>= 8;
1088
1089        for (j = 0; j < 8; ++j)
1090          if (inode_bits & (((ino_t)1) << (i * 8 + j)))
1091            *bit++ = offs + j;
1092      }
1093
1094    for (j = 0; j < 8; ++j)
1095      if (score_bits & (1 << j))
1096        *bit++ = offsetof (eio_dirent, score) * 8 + j;
1097  }
1098
1099  /* now actually do the sorting (a variant of MSD radix sort) */
1100  {
1101    eio_dirent    *base_stk [9 + sizeof (ino_t) * 8], *base;
1102    eio_dirent    *end_stk  [9 + sizeof (ino_t) * 8], *end;
1103    unsigned char *bit_stk  [9 + sizeof (ino_t) * 8];
1104    int stk_idx = 0;
1105
1106    base_stk [stk_idx] = dents;
1107    end_stk  [stk_idx] = dents + size;
1108    bit_stk  [stk_idx] = bit - 1;
1109
1110    do
1111      {
1112        base = base_stk [stk_idx];
1113        end  = end_stk  [stk_idx];
1114        bit  = bit_stk  [stk_idx];
1115
1116        for (;;)
1117          {
1118            unsigned char O = *bit >> 3;
1119            unsigned char M = 1 << (*bit & 7);
1120
1121            eio_dirent *a = base;
1122            eio_dirent *b = end;
1123
1124            if (b - a < EIO_SORT_CUTOFF)
1125              break;
1126
1127            /* now bit-partition the array on the bit */
1128            /* this ugly asymmetric loop seems to perform much better than typical */
1129            /* partition algos found in the literature */
1130            do
1131              if (!(((unsigned char *)a)[O] & M))
1132                ++a;
1133              else if (!(((unsigned char *)--b)[O] & M))
1134                {
1135                  eio_dirent tmp = *a; *a = *b; *b = tmp;
1136                  ++a;
1137                }
1138            while (b > a);
1139
1140            /* next bit, or stop, if no bits left in this path */
1141            if (!*--bit)
1142              break;
1143
1144            base_stk [stk_idx] = a;
1145            end_stk  [stk_idx] = end;
1146            bit_stk  [stk_idx] = bit;
1147            ++stk_idx;
1148
1149            end = a;
1150          }
1151      }
1152    while (stk_idx--);
1153  }
1154}
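
/* how the bit list above is built: "endianness" is loaded with the byte
 * offsets 0..sizeof (ino_t)-1, so shifting it out byte by byte yields the
 * in-memory offset of each inode byte regardless of host byte order; only
 * bits actually set somewhere (inode_bits, score_bits) are kept, and the
 * leading 0 entry acts as the stop sentinel tested via !*--bit. */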
1155
1156static void
1157eio_dent_insertion_sort (eio_dirent *dents, int size)
1158{
1159  /* first move the smallest element to the front, to act as a sentinel */
1160  {
1161    int i;
1162    eio_dirent *min = dents;
1163    
1164    /* the radix pre-pass ensures that the minimum element is in the first EIO_SORT_CUTOFF + 1 elements */
1165    for (i = size > EIO_SORT_FAST ? EIO_SORT_CUTOFF + 1 : size; --i; )
1166      if (EIO_DENT_CMP (dents [i], <, *min))
1167        min = &dents [i];
1168
1169    /* swap element 0 with the minimum */
1170    {
1171      eio_dirent tmp = *dents; *dents = *min; *min = tmp;
1172    }
1173  }
1174
1175  /* then do standard insertion sort, assuming that all elements are >= dents [0] */
1176  {
1177    eio_dirent *i, *j;
1178
1179    for (i = dents + 1; i < dents + size; ++i)
1180      {
1181        eio_dirent value = *i;
1182
1183        for (j = i - 1; EIO_DENT_CMP (*j, >, value); --j)
1184          j [1] = j [0];
1185
1186        j [1] = value;
1187      }
1188  }
1189}
1190
1191static void
1192eio_dent_sort (eio_dirent *dents, int size, signed char score_bits, ino_t inode_bits)
1193{
1194  if (size <= 1)
1195    return; /* our insertion sort relies on size > 0 */
1196
1197  /* first we use a radix sort, but only for dirs >= EIO_SORT_FAST */
1198  /* and stop sorting when the partitions are <= EIO_SORT_CUTOFF */
1199  eio_dent_radix_sort (dents, size, score_bits, inode_bits);
1200
1201  /* use an insertion sort at the end, or for small arrays, */
1202  /* as insertion sort is more efficient for small partitions */
1203  eio_dent_insertion_sort (dents, size);
1204}
1205
1206/* read a full directory */
1207static void
1208eio__scandir (eio_req *req, etp_worker *self)
1209{
1210  DIR *dirp;
1211  EIO_STRUCT_DIRENT *entp;
1212  char *name, *names;
1213  int namesalloc = 4096;
1214  int namesoffs = 0;
1215  int flags = req->int1;
1216  eio_dirent *dents = 0;
1217  int dentalloc = 128;
1218  int dentoffs = 0;
1219  ino_t inode_bits = 0;
1220
1221  req->result = -1;
1222
1223  if (!(flags & EIO_READDIR_DENTS))
1224    flags &= ~(EIO_READDIR_DIRS_FIRST | EIO_READDIR_STAT_ORDER);
1225
1226  X_LOCK (wrklock);
1227  /* the corresponding closedir is in ETP_WORKER_CLEAR */
1228  self->dirp = dirp = opendir (req->ptr1);
1229  req->flags |= EIO_FLAG_PTR1_FREE | EIO_FLAG_PTR2_FREE;
1230  req->ptr1 = dents = flags ? malloc (dentalloc * sizeof (eio_dirent)) : 0;
1231  req->ptr2 = names = malloc (namesalloc);
1232  X_UNLOCK (wrklock);
1233
1234  if (dirp && names && (!flags || dents))
1235    for (;;)
1236      {
1237        errno = 0;
1238        entp = readdir (dirp);
1239
1240        if (!entp)
1241          {
1242            if (errno)
1243              break;
1244
1245            /* sort etc. */
1246            req->int1   = flags;
1247            req->result = dentoffs;
1248
1249            if (flags & EIO_READDIR_STAT_ORDER)
1250              eio_dent_sort (dents, dentoffs, 0, inode_bits); /* sort by inode exclusively */
1251            else if (flags & EIO_READDIR_DIRS_FIRST)
1252              if (flags & EIO_READDIR_FOUND_UNKNOWN)
1253                eio_dent_sort (dents, dentoffs, 7, inode_bits); /* sort by score and inode */
1254              else
1255                {
1256                  /* in this case, all is known, and we just put dirs first and sort them */
1257                  eio_dirent *oth = dents + dentoffs;
1258                  eio_dirent *dir = dents;
1259
1260                  /* now partition dirs to the front, and non-dirs to the back */
1261                  /* by walking from both sides and swapping if necessary */
1262                  /* also clear score, so it doesn't influence sorting */
1263                  while (oth > dir)
1264                    {
1265                      if (dir->type == EIO_DT_DIR)
1266                        ++dir;
1267                      else if ((--oth)->type == EIO_DT_DIR)
1268                        {
1269                          eio_dirent tmp = *dir; *dir = *oth; *oth = tmp;
1270
1271                          ++dir;
1272                        }
1273                    }
1274
1275                  /* now sort the dirs only */
1276                  eio_dent_sort (dents, dir - dents, 0, inode_bits);
1277                }
1278
1279            break;
1280          }
1281
1282        /* now add the entry to our list(s) */
1283        name = entp->d_name;
1284
1285        /* skip . and .. entries */
1286        if (name [0] != '.' || (name [1] && (name [1] != '.' || name [2])))
1287          {
1288            int len = D_NAMLEN (entp) + 1;
1289
1290            while (expect_false (namesoffs + len > namesalloc))
1291              {
1292                namesalloc *= 2;
1293                X_LOCK (wrklock);
1294                req->ptr2 = names = realloc (names, namesalloc);
1295                X_UNLOCK (wrklock);
1296
1297                if (!names)
1298                  break;
1299              }
1300
1301            memcpy (names + namesoffs, name, len);
1302
1303            if (dents)
1304              {
1305                struct eio_dirent *ent;
1306
1307                if (expect_false (dentoffs == dentalloc))
1308                  {
1309                    dentalloc *= 2;
1310                    X_LOCK (wrklock);
1311                    req->ptr1 = dents = realloc (dents, dentalloc * sizeof (eio_dirent));
1312                    X_UNLOCK (wrklock);
1313
1314                    if (!dents)
1315                      break;
1316                  }
1317
1318                ent = dents + dentoffs;
1319
1320                ent->nameofs = namesoffs; /* rather dirtily we store the offset in the pointer */
1321                ent->namelen = len - 1;
1322                ent->inode   = D_INO (entp);
1323
1324                inode_bits |= ent->inode;
1325
1326                switch (D_TYPE (entp))
1327                  {
1328                    default:
1329                      ent->type = EIO_DT_UNKNOWN;
1330                      flags |= EIO_READDIR_FOUND_UNKNOWN;
1331                      break;
1332
1333                    #ifdef DT_FIFO
1334                      case DT_FIFO: ent->type = EIO_DT_FIFO; break;
1335                    #endif
1336                    #ifdef DT_CHR
1337                      case DT_CHR:  ent->type = EIO_DT_CHR;  break;
1338                    #endif          
1339                    #ifdef DT_MPC
1340                      case DT_MPC:  ent->type = EIO_DT_MPC;  break;
1341                    #endif          
1342                    #ifdef DT_DIR
1343                      case DT_DIR:  ent->type = EIO_DT_DIR;  break;
1344                    #endif          
1345                    #ifdef DT_NAM
1346                      case DT_NAM:  ent->type = EIO_DT_NAM;  break;
1347                    #endif          
1348                    #ifdef DT_BLK
1349                      case DT_BLK:  ent->type = EIO_DT_BLK;  break;
1350                    #endif          
1351                    #ifdef DT_MPB
1352                      case DT_MPB:  ent->type = EIO_DT_MPB;  break;
1353                    #endif          
1354                    #ifdef DT_REG
1355                      case DT_REG:  ent->type = EIO_DT_REG;  break;
1356                    #endif          
1357                    #ifdef DT_NWK
1358                      case DT_NWK:  ent->type = EIO_DT_NWK;  break;
1359                    #endif          
1360                    #ifdef DT_CMP
1361                      case DT_CMP:  ent->type = EIO_DT_CMP;  break;
1362                    #endif          
1363                    #ifdef DT_LNK
1364                      case DT_LNK:  ent->type = EIO_DT_LNK;  break;
1365                    #endif
1366                    #ifdef DT_SOCK
1367                      case DT_SOCK: ent->type = EIO_DT_SOCK; break;
1368                    #endif
1369                    #ifdef DT_DOOR
1370                      case DT_DOOR: ent->type = EIO_DT_DOOR; break;
1371                    #endif
1372                    #ifdef DT_WHT
1373                      case DT_WHT:  ent->type = EIO_DT_WHT;  break;
1374                    #endif
1375                  }
1376
1377                ent->score = 7;
1378
1379                if (flags & EIO_READDIR_DIRS_FIRST)
1380                  {
1381                    if (ent->type == EIO_DT_UNKNOWN)
1382                      {
1383                        if (*name == '.') /* leading dots are likely directories, and, in any case, rare */
1384                          ent->score = 1;
1385                        else if (!strchr (name, '.')) /* absence of dots indicates a likely dir */
1386                          ent->score = len <= 2 ? 4 - len : len <= 4 ? 4 : len <= 7 ? 5 : 6; /* shorter == more likely dir, but avoid too many classes */
1387                      }
1388                    else if (ent->type == EIO_DT_DIR)
1389                      ent->score = 0;
1390                  }
1391              }
1392
1393            namesoffs += len;
1394            ++dentoffs;
1395          }
1396
1397        if (EIO_CANCELLED (req))
1398          {
1399            errno = ECANCELED;
1400            break;
1401          }
1402      }
1403}
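
/* result layout produced above when EIO_READDIR_DENTS is requested:
 * req->result is the number of entries (or -1 on error), req->ptr1 is the
 * eio_dirent array (sorted according to the flags) and req->ptr2 is a single
 * block holding all names; each entry's name starts at
 * (char *)req->ptr2 + ent->nameofs.  Both blocks are freed automatically via
 * EIO_FLAG_PTR1_FREE and EIO_FLAG_PTR2_FREE when the request is destroyed. */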
1404
1405#ifdef PAGESIZE
1406# define eio_pagesize() PAGESIZE
1407#else
1408static intptr_t
1409eio_pagesize (void)
1410{
1411  static intptr_t page;
1412
1413  if (!page)
1414    page = sysconf (_SC_PAGESIZE);
1415
1416  return page;
1417}
1418#endif
1419
1420static void
1421eio_page_align (void **addr, size_t *length)
1422{
1423  intptr_t mask = eio_pagesize () - 1;
1424
1425  /* round down addr */
1426  intptr_t adj = mask & (intptr_t)*addr;
1427
1428  *addr   = (void *)((intptr_t)*addr - adj);
1429  *length += adj;
1430
1431  /* round up length */
1432  *length = (*length + mask) & ~mask;
1433}
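
/* a hypothetical worked example for the helper above, assuming 4096-byte
 * pages: addr = 0x12345234, length = 100  ->  adj = 0x234 (564), so addr
 * becomes 0x12345000, length first grows to 664 and is then rounded up to
 * 4096 - i.e. the smallest page-aligned range covering the original one. */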
1434
1435#if !_POSIX_MEMLOCK
1436# define eio__mlockall(a) ((errno = ENOSYS), -1)
1437#else
1438
1439static int
1440eio__mlockall (int flags)
1441{
1442  #if __GLIBC__ == 2 && __GLIBC_MINOR__ <= 7
1443    extern int mallopt (int, int);
1444    mallopt (-6, 238); /* http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=473812 */
1445  #endif
1446
1447  if (EIO_MCL_CURRENT   != MCL_CURRENT
1448      || EIO_MCL_FUTURE != MCL_FUTURE)
1449    {
1450      flags = 0
1451         | (flags & EIO_MCL_CURRENT ? MCL_CURRENT : 0)
1452         | (flags & EIO_MCL_FUTURE  ? MCL_FUTURE : 0);
1453    }
1454
1455  return mlockall (flags);
1456}
1457#endif
1458
1459#if !_POSIX_MEMLOCK_RANGE
1460# define eio__mlock(a,b) ((errno = ENOSYS), -1)
1461#else
1462
1463static int
1464eio__mlock (void *addr, size_t length)
1465{
1466  eio_page_align (&addr, &length);
1467
1468  return mlock (addr, length);
1469}
1470
1471#endif
1472
1473#if !(_POSIX_MAPPED_FILES && _POSIX_SYNCHRONIZED_IO)
1474# define eio__msync(a,b,c) ((errno = ENOSYS), -1)
1475#else
1476
1477int
1478eio__msync (void *mem, size_t len, int flags)
1479{
1480  eio_page_align (&mem, &len);
1481
1482  if (EIO_MS_ASYNC         != MS_ASYNC
1483      || EIO_MS_INVALIDATE != MS_INVALIDATE
1484      || EIO_MS_SYNC       != MS_SYNC)
1485    {
1486      flags = 0
1487         | (flags & EIO_MS_ASYNC      ? MS_ASYNC : 0)
1488         | (flags & EIO_MS_INVALIDATE ? MS_INVALIDATE : 0)
1489         | (flags & EIO_MS_SYNC       ? MS_SYNC : 0);
1490    }
1491
1492  return msync (mem, len, flags);
1493}
1494
1495#endif
1496
1497int
1498eio__mtouch (void *mem, size_t len, int flags)
1499{
1500  eio_page_align (&mem, &len);
1501
1502  {
1503    intptr_t addr = (intptr_t)mem;
1504    intptr_t end = addr + len;
1505    intptr_t page = eio_pagesize ();
1506
1507    if (addr < end)
1508      if (flags & EIO_MT_MODIFY) /* modify */
1509        do { *((volatile sig_atomic_t *)addr) |= 0; } while ((addr += page) < end);
1510      else
1511        do { *((volatile sig_atomic_t *)addr)     ; } while ((addr += page) < end);
1512  }
1513
1514  return 0;
1515}
1516
1517/*****************************************************************************/
1518
1519#define ALLOC(len)				\
1520  if (!req->ptr2)				\
1521    {						\
1522      X_LOCK (wrklock);				\
1523      req->flags |= EIO_FLAG_PTR2_FREE;		\
1524      X_UNLOCK (wrklock);			\
1525      req->ptr2 = malloc (len);			\
1526      if (!req->ptr2)				\
1527        {					\
1528          errno       = ENOMEM;			\
1529          req->result = -1;			\
1530          break;				\
1531        }					\
1532    }
1533
1534X_THREAD_PROC (etp_proc)
1535{
1536  ETP_REQ *req;
1537  struct timespec ts;
1538  etp_worker *self = (etp_worker *)thr_arg;
1539
1540  /* try to distribute timeouts somewhat randomly */
1541  ts.tv_nsec = ((unsigned long)self & 1023UL) * (1000000000UL / 1024UL);
1542
1543  for (;;)
1544    {
1545      X_LOCK (reqlock);
1546
1547      for (;;)
1548        {
1549          self->req = req = reqq_shift (&req_queue);
1550
1551          if (req)
1552            break;
1553
1554          ++idle;
1555
1556          ts.tv_sec = time (0) + IDLE_TIMEOUT;
1557          if (X_COND_TIMEDWAIT (reqwait, reqlock, ts) == ETIMEDOUT)
1558            {
1559              if (idle > max_idle)
1560                {
1561                  --idle;
1562                  X_UNLOCK (reqlock);
1563                  X_LOCK (wrklock);
1564                  --started;
1565                  X_UNLOCK (wrklock);
1566                  goto quit;
1567                }
1568
1569              /* we are allowed to idle, so do so without any timeout */
1570              X_COND_WAIT (reqwait, reqlock);
1571            }
1572
1573          --idle;
1574        }
1575
1576      --nready;
1577
1578      X_UNLOCK (reqlock);
1579     
1580      if (req->type < 0)
1581        goto quit;
1582
1583      if (!EIO_CANCELLED (req))
1584        ETP_EXECUTE (self, req);
1585
1586      X_LOCK (reslock);
1587
1588      ++npending;
1589
1590      if (!reqq_push (&res_queue, req) && want_poll_cb)
1591        want_poll_cb ();
1592
1593      self->req = 0;
1594      etp_worker_clear (self);
1595
1596      X_UNLOCK (reslock);
1597    }
1598
1599quit:
1600  X_LOCK (wrklock);
1601  etp_worker_free (self);
1602  X_UNLOCK (wrklock);
1603
1604  return 0;
1605}
1606
1607/*****************************************************************************/
1608
1609int eio_init (void (*want_poll)(void), void (*done_poll)(void))
1610{
1611  return etp_init (want_poll, done_poll);
1612}
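
/* a minimal integration sketch, assuming a self-pipe style event loop; the
 * pipe, its registration with the loop and error handling are placeholders,
 * only eio_init (), eio_poll () and the callback contract come from this
 * file (note that want_poll may run on a worker thread):
 *
 *   static int wakeup_pipe [2];
 *
 *   static void want_poll (void) { char c = 0; write (wakeup_pipe [1], &c, 1); }
 *   static void done_poll (void) { char c;     read  (wakeup_pipe [0], &c, 1); }
 *
 *   // at startup
 *   pipe (wakeup_pipe);
 *   eio_init (want_poll, done_poll);
 *
 *   // whenever the event loop sees wakeup_pipe [0] readable
 *   if (eio_poll () == -1)
 *     ;  // EAGAIN: poll limits were hit, simply call eio_poll () again later
 */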
1613
1614static void eio_api_destroy (eio_req *req)
1615{
1616  free (req);
1617}
1618
1619#define REQ(rtype)                                            	\
1620  eio_req *req;                                                 \
1621                                                                \
1622  req = (eio_req *)calloc (1, sizeof *req);                     \
1623  if (!req)                                                     \
1624    return 0;                                                   \
1625                                                                \
1626  req->type    = rtype;                                         \
1627  req->pri     = pri;						\
1628  req->finish  = cb;						\
1629  req->data    = data;						\
1630  req->destroy = eio_api_destroy;
1631
1632#define SEND eio_submit (req); return req
1633
1634#define PATH							\
1635  req->flags |= EIO_FLAG_PTR1_FREE;				\
1636  req->ptr1 = strdup (path);					\
1637  if (!req->ptr1)						\
1638    {								\
1639      eio_api_destroy (req);					\
1640      return 0;							\
1641    }
1642
1643static void eio_execute (etp_worker *self, eio_req *req)
1644{
1645  switch (req->type)
1646    {
1647      case EIO_READ:      ALLOC (req->size);
1648                          req->result = req->offs >= 0
1649                                      ? pread     (req->int1, req->ptr2, req->size, req->offs)
1650                                      : read      (req->int1, req->ptr2, req->size); break;
1651      case EIO_WRITE:     req->result = req->offs >= 0
1652                                      ? pwrite    (req->int1, req->ptr2, req->size, req->offs)
1653                                      : write     (req->int1, req->ptr2, req->size); break;
1654
1655      case EIO_READAHEAD: req->result = readahead     (req->int1, req->offs, req->size); break;
1656      case EIO_SENDFILE:  req->result = eio__sendfile (req->int1, req->int2, req->offs, req->size, self); break;
1657
1658      case EIO_STAT:      ALLOC (sizeof (EIO_STRUCT_STAT));
1659                          req->result = stat      (req->ptr1, (EIO_STRUCT_STAT *)req->ptr2); break;
1660      case EIO_LSTAT:     ALLOC (sizeof (EIO_STRUCT_STAT));
1661                          req->result = lstat     (req->ptr1, (EIO_STRUCT_STAT *)req->ptr2); break;
1662      case EIO_FSTAT:     ALLOC (sizeof (EIO_STRUCT_STAT));
1663                          req->result = fstat     (req->int1, (EIO_STRUCT_STAT *)req->ptr2); break;
1664
1665      case EIO_STATVFS:   ALLOC (sizeof (EIO_STRUCT_STATVFS));
1666                          req->result = statvfs   (req->ptr1, (EIO_STRUCT_STATVFS *)req->ptr2); break;
1667      case EIO_FSTATVFS:  ALLOC (sizeof (EIO_STRUCT_STATVFS));
1668                          req->result = fstatvfs  (req->int1, (EIO_STRUCT_STATVFS *)req->ptr2); break;
1669
1670      case EIO_CHOWN:     req->result = chown     (req->ptr1, req->int2, req->int3); break;
1671      case EIO_FCHOWN:    req->result = fchown    (req->int1, req->int2, req->int3); break;
1672      case EIO_CHMOD:     req->result = chmod     (req->ptr1, (mode_t)req->int2); break;
1673      case EIO_FCHMOD:    req->result = fchmod    (req->int1, (mode_t)req->int2); break;
1674      case EIO_TRUNCATE:  req->result = truncate  (req->ptr1, req->offs); break;
1675      case EIO_FTRUNCATE: req->result = ftruncate (req->int1, req->offs); break;
1676
1677      case EIO_OPEN:      req->result = open      (req->ptr1, req->int1, (mode_t)req->int2); break;
1678      case EIO_CLOSE:     req->result = close     (req->int1); break;
1679      case EIO_DUP2:      req->result = dup2      (req->int1, req->int2); break;
1680      case EIO_UNLINK:    req->result = unlink    (req->ptr1); break;
1681      case EIO_RMDIR:     req->result = rmdir     (req->ptr1); break;
1682      case EIO_MKDIR:     req->result = mkdir     (req->ptr1, (mode_t)req->int2); break;
1683      case EIO_RENAME:    req->result = rename    (req->ptr1, req->ptr2); break;
1684      case EIO_LINK:      req->result = link      (req->ptr1, req->ptr2); break;
1685      case EIO_SYMLINK:   req->result = symlink   (req->ptr1, req->ptr2); break;
1686      case EIO_MKNOD:     req->result = mknod     (req->ptr1, (mode_t)req->int2, (dev_t)req->int3); break;
1687
1688      case EIO_READLINK:  ALLOC (PATH_MAX);
1689                          req->result = readlink  (req->ptr1, req->ptr2, PATH_MAX); break;
1690
1691      case EIO_SYNC:      req->result = 0; sync (); break;
1692      case EIO_FSYNC:     req->result = fsync     (req->int1); break;
1693      case EIO_FDATASYNC: req->result = fdatasync (req->int1); break;
1694      case EIO_MSYNC:     req->result = eio__msync (req->ptr2, req->size, req->int1); break;
1695      case EIO_MTOUCH:    req->result = eio__mtouch (req->ptr2, req->size, req->int1); break;
1696      case EIO_MLOCK:     req->result = eio__mlock (req->ptr2, req->size); break;
1697      case EIO_MLOCKALL:  req->result = eio__mlockall (req->int1); break;
1698      case EIO_SYNC_FILE_RANGE: req->result = eio__sync_file_range (req->int1, req->offs, req->size, req->int2); break;
1699
1700      case EIO_READDIR:   eio__scandir (req, self); break;
1701
1702      case EIO_BUSY:
1703#ifdef _WIN32
1704	Sleep (req->nv1 * 1e3);
1705#else
1706        {
1707          struct timeval tv;
1708
1709          tv.tv_sec  = req->nv1;
1710          tv.tv_usec = (req->nv1 - tv.tv_sec) * 1e6;
1711
1712          req->result = select (0, 0, 0, 0, &tv);
1713        }
1714#endif
1715        break;
1716
1717      case EIO_UTIME:
1718      case EIO_FUTIME:
1719        {
1720          struct timeval tv[2];
1721          struct timeval *times;
1722
1723          if (req->nv1 != -1. || req->nv2 != -1.)
1724            {
1725              tv[0].tv_sec  = req->nv1;
1726              tv[0].tv_usec = (req->nv1 - tv[0].tv_sec) * 1000000.;
1727              tv[1].tv_sec  = req->nv2;
1728              tv[1].tv_usec = (req->nv2 - tv[1].tv_sec) * 1000000.;
1729
1730              times = tv;
1731            }
1732          else
1733            times = 0;
1734
1735          req->result = req->type == EIO_FUTIME
1736                        ? futimes (req->int1, times)
1737                        : utimes  (req->ptr1, times);
1738        }
1739        break;
1740
1741      case EIO_GROUP:
1742        abort (); /* handled in eio_request */
1743
1744      case EIO_NOP:
1745        req->result = 0;
1746        break;
1747
1748      case EIO_CUSTOM:
1749        ((void (*)(eio_req *))req->feed) (req);
1750        break;
1751
1752      default:
1753        errno = ENOSYS;
1754        req->result = -1;
1755        break;
1756    }
1757
1758  req->errorno = errno;
1759}
1760
1761#ifndef EIO_NO_WRAPPERS
1762
1763eio_req *eio_nop (int pri, eio_cb cb, void *data)
1764{
1765  REQ (EIO_NOP); SEND;
1766}
1767
1768eio_req *eio_busy (double delay, int pri, eio_cb cb, void *data)
1769{
1770  REQ (EIO_BUSY); req->nv1 = delay; SEND;
1771}
1772
1773eio_req *eio_sync (int pri, eio_cb cb, void *data)
1774{
1775  REQ (EIO_SYNC); SEND;
1776}
1777
1778eio_req *eio_fsync (int fd, int pri, eio_cb cb, void *data)
1779{
1780  REQ (EIO_FSYNC); req->int1 = fd; SEND;
1781}
1782
1783eio_req *eio_msync (void *addr, size_t length, int flags, int pri, eio_cb cb, void *data)
1784{
1785  REQ (EIO_MSYNC); req->ptr2 = addr; req->size = length; req->int1 = flags; SEND;
1786}
1787
1788eio_req *eio_mtouch (void *addr, size_t length, int flags, int pri, eio_cb cb, void *data)
1789{
1790  REQ (EIO_MTOUCH); req->ptr2 = addr; req->size = length; req->int1 = flags; SEND;
1791}
1792
1793eio_req *eio_mlock (void *addr, size_t length, int pri, eio_cb cb, void *data)
1794{
1795  REQ (EIO_MLOCK); req->ptr2 = addr; req->size = length; SEND;
1796}
1797
1798eio_req *eio_mlockall (int flags, int pri, eio_cb cb, void *data)
1799{
1800  REQ (EIO_MLOCKALL); req->int1 = flags; SEND;
1801}
1802
1803eio_req *eio_sync_file_range (int fd, off_t offset, size_t nbytes, unsigned int flags, int pri, eio_cb cb, void *data)
1804{
1805  REQ (EIO_SYNC_FILE_RANGE); req->int1 = fd; req->offs = offset; req->size = nbytes; req->int2 = flags; SEND;
1806}
1807
1808eio_req *eio_fdatasync (int fd, int pri, eio_cb cb, void *data)
1809{
1810  REQ (EIO_FDATASYNC); req->int1 = fd; SEND;
1811}
1812
1813eio_req *eio_close (int fd, int pri, eio_cb cb, void *data)
1814{
1815  REQ (EIO_CLOSE); req->int1 = fd; SEND;
1816}
1817
1818eio_req *eio_readahead (int fd, off_t offset, size_t length, int pri, eio_cb cb, void *data)
1819{
1820  REQ (EIO_READAHEAD); req->int1 = fd; req->offs = offset; req->size = length; SEND;
1821}
1822
1823eio_req *eio_read (int fd, void *buf, size_t length, off_t offset, int pri, eio_cb cb, void *data)
1824{
1825  REQ (EIO_READ); req->int1 = fd; req->offs = offset; req->size = length; req->ptr2 = buf; SEND;
1826}
1827
1828eio_req *eio_write (int fd, void *buf, size_t length, off_t offset, int pri, eio_cb cb, void *data)
1829{
1830  REQ (EIO_WRITE); req->int1 = fd; req->offs = offset; req->size = length; req->ptr2 = buf; SEND;
1831}
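
/* a small usage sketch for the read/write wrappers above; fd, buf,
 * use_bytes () and my_data are placeholders, while result, errorno, ptr2 and
 * the callback return value behave as implemented in eio_execute () and
 * etp_poll ():
 *
 *   static int read_done (eio_req *req)
 *   {
 *     if (req->result < 0)
 *       fprintf (stderr, "read failed: %s\n", strerror (req->errorno));
 *     else
 *       use_bytes (req->ptr2, req->result);  // bytes actually read
 *
 *     return 0;  // returning non-zero makes eio_poll () return early
 *   }
 *
 *   eio_read (fd, buf, 4096, 0, 0, read_done, my_data);
 */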
1832
1833eio_req *eio_fstat (int fd, int pri, eio_cb cb, void *data)
1834{
1835  REQ (EIO_FSTAT); req->int1 = fd; SEND;
1836}
1837
1838eio_req *eio_fstatvfs (int fd, int pri, eio_cb cb, void *data)
1839{
1840  REQ (EIO_FSTATVFS); req->int1 = fd; SEND;
1841}
1842
1843eio_req *eio_futime (int fd, double atime, double mtime, int pri, eio_cb cb, void *data)
1844{
1845  REQ (EIO_FUTIME); req->int1 = fd; req->nv1 = atime; req->nv2 = mtime; SEND;
1846}
1847
1848eio_req *eio_ftruncate (int fd, off_t offset, int pri, eio_cb cb, void *data)
1849{
1850  REQ (EIO_FTRUNCATE); req->int1 = fd; req->offs = offset; SEND;
1851}
1852
1853eio_req *eio_fchmod (int fd, mode_t mode, int pri, eio_cb cb, void *data)
1854{
1855  REQ (EIO_FCHMOD); req->int1 = fd; req->int2 = (long)mode; SEND;
1856}
1857
1858eio_req *eio_fchown (int fd, uid_t uid, gid_t gid, int pri, eio_cb cb, void *data)
1859{
1860  REQ (EIO_FCHOWN); req->int1 = fd; req->int2 = (long)uid; req->int3 = (long)gid; SEND;
1861}
1862
1863eio_req *eio_dup2 (int fd, int fd2, int pri, eio_cb cb, void *data)
1864{
1865  REQ (EIO_DUP2); req->int1 = fd; req->int2 = fd2; SEND;
1866}
1867
1868eio_req *eio_sendfile (int out_fd, int in_fd, off_t in_offset, size_t length, int pri, eio_cb cb, void *data)
1869{
1870  REQ (EIO_SENDFILE); req->int1 = out_fd; req->int2 = in_fd; req->offs = in_offset; req->size = length; SEND;
1871}
1872
1873eio_req *eio_open (const char *path, int flags, mode_t mode, int pri, eio_cb cb, void *data)
1874{
1875  REQ (EIO_OPEN); PATH; req->int1 = flags; req->int2 = (long)mode; SEND;
1876}
1877
1878eio_req *eio_utime (const char *path, double atime, double mtime, int pri, eio_cb cb, void *data)
1879{
1880  REQ (EIO_UTIME); PATH; req->nv1 = atime; req->nv2 = mtime; SEND;
1881}
1882
1883eio_req *eio_truncate (const char *path, off_t offset, int pri, eio_cb cb, void *data)
1884{
1885  REQ (EIO_TRUNCATE); PATH; req->offs = offset; SEND;
1886}
1887
1888eio_req *eio_chown (const char *path, uid_t uid, gid_t gid, int pri, eio_cb cb, void *data)
1889{
1890  REQ (EIO_CHOWN); PATH; req->int2 = (long)uid; req->int3 = (long)gid; SEND;
1891}
1892
1893eio_req *eio_chmod (const char *path, mode_t mode, int pri, eio_cb cb, void *data)
1894{
1895  REQ (EIO_CHMOD); PATH; req->int2 = (long)mode; SEND;
1896}
1897
1898eio_req *eio_mkdir (const char *path, mode_t mode, int pri, eio_cb cb, void *data)
1899{
1900  REQ (EIO_MKDIR); PATH; req->int2 = (long)mode; SEND;
1901}
1902
1903static eio_req *
1904eio__1path (int type, const char *path, int pri, eio_cb cb, void *data)
1905{
1906  REQ (type); PATH; SEND;
1907}
1908
1909eio_req *eio_readlink (const char *path, int pri, eio_cb cb, void *data)
1910{
1911  return eio__1path (EIO_READLINK, path, pri, cb, data);
1912}
1913
1914eio_req *eio_stat (const char *path, int pri, eio_cb cb, void *data)
1915{
1916  return eio__1path (EIO_STAT, path, pri, cb, data);
1917}
1918
1919eio_req *eio_lstat (const char *path, int pri, eio_cb cb, void *data)
1920{
1921  return eio__1path (EIO_LSTAT, path, pri, cb, data);
1922}
1923
1924eio_req *eio_statvfs (const char *path, int pri, eio_cb cb, void *data)
1925{
1926  return eio__1path (EIO_STATVFS, path, pri, cb, data);
1927}
1928
1929eio_req *eio_unlink (const char *path, int pri, eio_cb cb, void *data)
1930{
1931  return eio__1path (EIO_UNLINK, path, pri, cb, data);
1932}
1933
1934eio_req *eio_rmdir (const char *path, int pri, eio_cb cb, void *data)
1935{
1936  return eio__1path (EIO_RMDIR, path, pri, cb, data);
1937}
1938
1939eio_req *eio_readdir (const char *path, int flags, int pri, eio_cb cb, void *data)
1940{
1941  REQ (EIO_READDIR); PATH; req->int1 = flags; SEND;
1942}
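
/* a usage sketch for eio_readdir (); the path, flag combination and callback
 * body are illustrative, the result layout is the one built by eio__scandir ():
 *
 *   static int readdir_done (eio_req *req)
 *   {
 *     if (req->result >= 0)
 *       {
 *         eio_dirent *ents  = (eio_dirent *)req->ptr1;
 *         char       *names = (char *)req->ptr2;
 *         int i;
 *
 *         for (i = 0; i < req->result; ++i)
 *           printf ("%s%s\n", names + ents [i].nameofs,
 *                   ents [i].type == EIO_DT_DIR ? "/" : "");
 *       }
 *
 *     return 0;
 *   }
 *
 *   eio_readdir ("/tmp", EIO_READDIR_DENTS | EIO_READDIR_DIRS_FIRST,
 *                0, readdir_done, 0);
 */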
1943
1944eio_req *eio_mknod (const char *path, mode_t mode, dev_t dev, int pri, eio_cb cb, void *data)
1945{
1946  REQ (EIO_MKNOD); PATH; req->int2 = (long)mode; req

Large files are truncated; the remainder of the file is not shown.