/* $NetBSD: sys_select.c,v 1.39 2014/04/25 15:52:45 pooka Exp $ */

/*-
 * Copyright (c) 2007, 2008, 2009, 2010 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran and Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
 */

/*
 * System calls of the synchronous I/O multiplexing subsystem.
 *
 * Locking
 *
 * Two locks are used: <object-lock> and selcluster_t::sc_lock.
 *
 * The <object-lock> is provided by a device driver or another subsystem,
 * e.g. a socket or a pipe.  It is not exported, and thus invisible to this
 * subsystem.  Mainly, synchronisation between the selrecord() and
 * selnotify() routines depends on this lock, as described in the comments
 * below.
 *
 * Lock order
 *
 *	<object-lock> ->
 *		selcluster_t::sc_lock
 */
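
/*
 * Example (a minimal, hypothetical sketch; the softc layout and the names
 * example_poll, sc_lock, sc_ready and sc_selq are illustrative only, not
 * part of this subsystem).  An object owner typically pairs selrecord()
 * and selnotify() under its own <object-lock>:
 *
 *	static int
 *	example_poll(file_t *fp, int events)
 *	{
 *		struct example_softc *sc = fp->f_data;
 *		int revents = 0;
 *
 *		mutex_enter(&sc->sc_lock);
 *		if (sc->sc_ready)
 *			revents = events & (POLLIN | POLLRDNORM);
 *		else
 *			selrecord(curlwp, &sc->sc_selq);
 *		mutex_exit(&sc->sc_lock);
 *		return revents;
 *	}
 *
 * and, when data becomes ready, still holding sc_lock:
 *
 *	sc->sc_ready = true;
 *	selnotify(&sc->sc_selq, POLLIN | POLLRDNORM, 0);
 */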

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.39 2014/04/25 15:52:45 pooka Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/poll.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/sleepq.h>
#include <sys/sysctl.h>

/* Flags for lwp::l_selflag. */
#define SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define SEL_SCANNING	1	/* polling descriptors */
#define SEL_BLOCKING	2	/* blocking and waiting for event */
#define SEL_EVENT	3	/* interrupted, events set directly */

/* Operations: either select() or poll(). */
#define SELOP_SELECT	1
#define SELOP_POLL	2

/*
 * Per-cluster state for select()/poll().  For a system with 32 or fewer
 * CPUs, this gives us per-CPU clusters.
 */
#define SELCLUSTERS	32
#define SELCLUSTERMASK	(SELCLUSTERS - 1)
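
/*
 * For example (illustrative only): on a machine with 64 CPUs, cpu_index()
 * values n and n + 32 map to the same cluster, since the index is reduced
 * modulo SELCLUSTERS in selsysinit() below.
 */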

typedef struct selcluster {
	kmutex_t	*sc_lock;
	sleepq_t	sc_sleepq;
	int		sc_ncoll;
	uint32_t	sc_mask;
} selcluster_t;

static inline int	selscan(char *, const int, const size_t, register_t *);
static inline int	pollscan(struct pollfd *, const int, register_t *);
static void		selclear(void);

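/*
 * Events checked for each of the three select() descriptor sets, in
 * order: read, write and exceptional conditions.  Used by selscan()
 * and sel_setevents().
 */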
static const int sel_flag[] = {
	POLLRDNORM | POLLHUP | POLLERR,
	POLLWRNORM | POLLHUP | POLLERR,
	POLLRDBAND
};

static syncobj_t select_sobj = {
	SOBJ_SLEEPQ_FIFO,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

static selcluster_t	*selcluster[SELCLUSTERS] __read_mostly;
static int		direct_select __read_mostly = 0;

/*
 * Select system call.
 */
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(sigset_t *) mask;
	} */
	struct timespec ats, *ts = NULL;
	sigset_t amask, *mask = NULL;
	int error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask) != NULL) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, mask);
}

int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */
	struct timeval atv;
	struct timespec ats, *ts = NULL;
	int error;

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
		if (error)
			return error;
		TIMEVAL_TO_TIMESPEC(&atv, &ats);
		ts = &ats;
	}

	return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
}

/*
 * sel_do_scan: common code to perform the scan on descriptors.
 */
static int
sel_do_scan(const int op, void *fds, const int nf, const size_t ni,
    struct timespec *ts, sigset_t *mask, register_t *retval)
{
	lwp_t * const l = curlwp;
	selcluster_t *sc;
	kmutex_t *lock;
	struct timespec sleepts;
	int error, timo;

	timo = 0;
	if (ts && inittimeleft(ts, &sleepts) == -1) {
		return EINVAL;
	}

	if (__predict_false(mask))
		sigsuspendsetup(l, mask);

	sc = curcpu()->ci_data.cpu_selcluster;
	lock = sc->sc_lock;
	l->l_selcluster = sc;
	if (op == SELOP_SELECT) {
		l->l_selbits = fds;
		l->l_selni = ni;
	} else {
		l->l_selbits = NULL;
	}

	for (;;) {
		int ncoll;

		SLIST_INIT(&l->l_selwait);
		l->l_selret = 0;

		/*
		 * No need to lock.  If this is overwritten by another value
		 * while scanning, we will retry below.  We only need to see
		 * exact state from the descriptors that we are about to poll,
		 * and lock activity resulting from fo_poll is enough to
		 * provide an up-to-date value for new polling activity.
		 */
		l->l_selflag = SEL_SCANNING;
		ncoll = sc->sc_ncoll;

		if (op == SELOP_SELECT) {
			error = selscan((char *)fds, nf, ni, retval);
		} else {
			error = pollscan((struct pollfd *)fds, nf, retval);
		}
		if (error || *retval)
			break;
		if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
			break;
		/*
		 * Acquire the lock and perform the (re)checks.  Note, if
		 * a collision has occurred, then our state does not matter,
		 * as we must perform a re-scan.  Therefore, check it first.
		 */
state_check:
		mutex_spin_enter(lock);
		if (__predict_false(sc->sc_ncoll != ncoll)) {
			/* Collision: perform a re-scan. */
			mutex_spin_exit(lock);
			selclear();
			continue;
		}
		if (__predict_true(l->l_selflag == SEL_EVENT)) {
			/* Events occurred, they are set directly. */
			mutex_spin_exit(lock);
			break;
		}
		if (__predict_true(l->l_selflag == SEL_RESET)) {
			/* Events occurred, but a re-scan is requested. */
			mutex_spin_exit(lock);
			selclear();
			continue;
		}
		/* Nothing happened; sleep. */
		l->l_selflag = SEL_BLOCKING;
		l->l_kpriority = true;
		sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
		error = sleepq_block(timo, true);
		if (error != 0) {
			break;
		}
		/* Awoken: need to check the state. */
		goto state_check;
	}
	selclear();

	/* Add direct events if any. */
	if (l->l_selflag == SEL_EVENT) {
		KASSERT(l->l_selret != 0);
		*retval += l->l_selret;
	}

	if (__predict_false(mask))
		sigsuspendteardown(l);

	/* select and poll are not restarted after signals... */
	if (error == ERESTART)
		return EINTR;
	if (error == EWOULDBLOCK)
		return 0;
	return error;
}

int
selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou,
    fd_set *u_ex, struct timespec *ts, sigset_t *mask)
{
	char smallbits[howmany(FD_SETSIZE, NFDBITS) *
	    sizeof(fd_mask) * 6];
	char *bits;
	int error, nf;
	size_t ni;

	if (nd < 0)
		return (EINVAL);
	nf = curlwp->l_fd->fd_dt->dt_nfiles;
	if (nd > nf) {
		/* forgiving; slightly wrong */
		nd = nf;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits)) {
		bits = kmem_alloc(ni * 6, KM_SLEEP);
		if (bits == NULL)
			return ENOMEM;
	} else
		bits = smallbits;

#define getbits(name, x) \
	if (u_ ## name) { \
		error = copyin(u_ ## name, bits + ni * x, ni); \
		if (error) \
			goto fail; \
	} else \
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef getbits

	error = sel_do_scan(SELOP_SELECT, bits, nd, ni, ts, mask, retval);
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
 fail:
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}

static inline int
selscan(char *bits, const int nfd, const size_t ni, register_t *retval)
{
	fd_mask *ibitp, *obitp;
	int msk, i, j, fd, n;
	file_t *fp;

	ibitp = (fd_mask *)(bits + ni * 0);
	obitp = (fd_mask *)(bits + ni * 3);
	n = 0;

	memset(obitp, 0, ni * 3);
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			fd_mask ibits, obits;

			ibits = *ibitp;
			obits = 0;
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fd)) == NULL)
					return (EBADF);
				/*
				 * Set up the argument for selrecord(), which
				 * is the file descriptor number.
				 */
				curlwp->l_selrec = fd;
				if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) {
					obits |= (1 << j);
					n++;
				}
				fd_putfile(fd);
			}
			if (obits != 0) {
				if (direct_select) {
					kmutex_t *lock;
					lock = curlwp->l_selcluster->sc_lock;
					mutex_spin_enter(lock);
					*obitp |= obits;
					mutex_spin_exit(lock);
				} else {
					*obitp |= obits;
				}
			}
			ibitp++;
			obitp++;
		}
	}
	*retval = n;
	return (0);
}
/*
 * Poll system call.
 */
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
	/* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */
	struct timespec ats, *ts = NULL;

	if (SCARG(uap, timeout) != INFTIM) {
		ats.tv_sec = SCARG(uap, timeout) / 1000;
		ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
		ts = &ats;
	}

	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL);
}

/*
 * Poll system call with a timespec timeout and signal mask (pollts).
 */
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */
	struct timespec ats, *ts = NULL;
	sigset_t amask, *mask = NULL;
	int error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask)) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask);
}

int
pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds,
    struct timespec *ts, sigset_t *mask)
{
	struct pollfd smallfds[32];
	struct pollfd *fds;
	int error;
	size_t ni;

	if (nfds > 1000 + curlwp->l_fd->fd_dt->dt_nfiles) {
		/*
		 * Either the user passed in a very sparse 'fds' or junk!
		 * The kmem_alloc() call below would be bad news.
		 * We could process the 'fds' array in chunks, but that
		 * is a lot of code that isn't normally useful.
		 * (Or just move the copyin/out into pollscan().)
		 * Historically the code silently truncated 'fds' to
		 * dt_nfiles entries - but that does cause issues.
		 */
		return EINVAL;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallfds)) {
		fds = kmem_alloc(ni, KM_SLEEP);
		if (fds == NULL)
			return ENOMEM;
	} else
		fds = smallfds;

	error = copyin(u_fds, fds, ni);
	if (error)
		goto fail;

	error = sel_do_scan(SELOP_POLL, fds, nfds, ni, ts, mask, retval);
	if (error == 0)
		error = copyout(fds, u_fds, ni);
 fail:
	if (fds != smallfds)
		kmem_free(fds, ni);
	return (error);
}

static inline int
pollscan(struct pollfd *fds, const int nfd, register_t *retval)
{
	file_t *fp;
	int i, n = 0, revents;

	for (i = 0; i < nfd; i++, fds++) {
		fds->revents = 0;
		if (fds->fd < 0) {
			revents = 0;
		} else if ((fp = fd_getfile(fds->fd)) == NULL) {
			revents = POLLNVAL;
		} else {
			/*
			 * Perform the poll: it registers a select request
			 * or returns the events which are set.  Set up the
			 * argument for selrecord(), which is a pointer to
			 * the struct pollfd.
			 */
			curlwp->l_selrec = (uintptr_t)fds;
			revents = (*fp->f_ops->fo_poll)(fp,
			    fds->events | POLLERR | POLLHUP);
			fd_putfile(fds->fd);
		}
		if (revents) {
			fds->revents = revents;
			n++;
		}
	}
	*retval = n;
	return (0);
}

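/*
 * seltrue: a poll routine for objects that are always ready for I/O.
 * (As a hedged usage note: drivers with no special poll handling
 * conventionally point their poll entry, e.g. d_poll, at this routine.)
 */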
int
seltrue(dev_t dev, int events, lwp_t *l)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  Concurrency issues:
 *
 * The caller holds the same lock across calls to selrecord() and
 * selnotify(), so we don't need to consider a concurrent wakeup
 * while in this routine.
 *
 * The only activity we need to guard against is selclear(), called by
 * another thread that is exiting sel_do_scan().
 * `sel_lwp' can only become non-NULL while the caller's lock is held,
 * so it cannot become non-NULL due to a change made by another thread
 * while we are in this routine.  It can only become _NULL_ due to a
 * call to selclear().
 *
 * If it is non-NULL and != selector there is the potential for
 * selclear() to be called by another thread.  If either of those
 * conditions is true, we're not interested in touching the `named
 * waiter' part of the selinfo record because we need to record a
 * collision.  Hence there is no need for additional locking in this
 * routine.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
	selcluster_t *sc;
	lwp_t *other;

	KASSERT(selector == curlwp);

	sc = selector->l_selcluster;
	other = sip->sel_lwp;

	if (other == selector) {
		/* 1. We (selector) already claimed to be the first LWP. */
		KASSERT(sip->sel_cluster == sc);
	} else if (other == NULL) {
		/*
		 * 2. No first LWP, therefore we (selector) are the first.
		 *
		 * There may be unnamed waiters (collisions).  Issue a memory
		 * barrier to ensure that we access sel_lwp (above) before
		 * other fields - this guards against a call to selclear().
		 */
		membar_enter();
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
		/* Copy the argument, which is for selnotify(). */
		sip->sel_fdinfo = selector->l_selrec;
		/* Replace selinfo's lock with the chosen cluster's lock. */
		sip->sel_cluster = sc;
	} else {
		/* 3. Multiple waiters: record a collision. */
		sip->sel_collision |= sc->sc_mask;
		KASSERT(sip->sel_cluster != NULL);
	}
}

/*
 * sel_setevents: a helper function for selnotify(), to set the events
 * for the LWP sleeping in selcommon() or pollcommon().
 */
static inline bool
sel_setevents(lwp_t *l, struct selinfo *sip, const int events)
{
	const int oflag = l->l_selflag;
	int ret = 0;

	/*
	 * If we require a re-scan, or one was requested by somebody else,
	 * then just (re)set SEL_RESET and return.
	 */
	if (__predict_false(events == 0 || oflag == SEL_RESET)) {
		l->l_selflag = SEL_RESET;
		return true;
	}
	/*
	 * Direct set.  Note: the select state of the LWP is locked.  First,
	 * determine whether it is selcommon() or pollcommon().
	 */
	if (l->l_selbits != NULL) {
		const size_t ni = l->l_selni;
		fd_mask *fds = (fd_mask *)l->l_selbits;
		fd_mask *ofds = (fd_mask *)((char *)fds + ni * 3);
		const int fd = sip->sel_fdinfo, fbit = 1 << (fd & __NFDMASK);
		const int idx = fd >> __NFDSHIFT;
		int n;

		for (n = 0; n < 3; n++) {
			if ((fds[idx] & fbit) != 0 &&
			    (ofds[idx] & fbit) == 0 &&
			    (sel_flag[n] & events)) {
				ofds[idx] |= fbit;
				ret++;
			}
			fds = (fd_mask *)((char *)fds + ni);
			ofds = (fd_mask *)((char *)ofds + ni);
		}
	} else {
		struct pollfd *pfd = (void *)sip->sel_fdinfo;
		int revents = events & (pfd->events | POLLERR | POLLHUP);

		if (revents) {
			if (pfd->revents == 0)
				ret = 1;
			pfd->revents |= revents;
		}
	}
	/* Check whether there are any events to return. */
	if (!ret) {
		return false;
	}
	/* Indicate direct set and note the event (cluster lock is held). */
	l->l_selflag = SEL_EVENT;
	l->l_selret += ret;
	return true;
}

/*
 * Do a wakeup when a selectable event occurs.  Concurrency issues:
 *
 * As per selrecord(), the caller's object lock is held.  If there
 * is a named waiter, we must acquire the associated selcluster's lock
 * in order to synchronize with selclear() and pollers going to sleep
 * in sel_do_scan().
 *
 * sip->sel_cluster cannot change at this point, as it is only changed
 * in selrecord(), and concurrent calls to selrecord() are locked
 * out by the caller.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	selcluster_t *sc;
	uint32_t mask;
	int index, oflag;
	lwp_t *l;
	kmutex_t *lock;

	KNOTE(&sip->sel_klist, knhint);

	if (sip->sel_lwp != NULL) {
		/* One named LWP is waiting. */
		sc = sip->sel_cluster;
		lock = sc->sc_lock;
		mutex_spin_enter(lock);
		/* Still there? */
		if (sip->sel_lwp != NULL) {
			/*
			 * Try to set the events for our LWP directly;
			 * otherwise, request a full re-scan.
			 */
			l = sip->sel_lwp;
			oflag = l->l_selflag;

			if (!direct_select) {
				l->l_selflag = SEL_RESET;
			} else if (!sel_setevents(l, sip, events)) {
				/* No events to return. */
				mutex_spin_exit(lock);
				return;
			}

			/*
			 * If the thread is sleeping, wake it up.  If it's
			 * not yet asleep, it will notice the change in state
			 * and will re-poll the descriptors.
			 */
			if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
				KASSERT(l->l_wchan == sc);
				sleepq_unsleep(l, false);
			}
		}
		mutex_spin_exit(lock);
	}

	if ((mask = sip->sel_collision) != 0) {
		/*
		 * There was a collision (multiple waiters): we must
		 * inform all potentially interested waiters.
		 */
		sip->sel_collision = 0;
		do {
			index = ffs(mask) - 1;
			mask &= ~(1 << index);
			sc = selcluster[index];
			lock = sc->sc_lock;
			mutex_spin_enter(lock);
			sc->sc_ncoll++;
			sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
		} while (__predict_false(mask != 0));
	}
}

/*
 * Remove an LWP from all objects that it is waiting for.  Concurrency
 * issues:
 *
 * The object owner's (e.g. device driver) lock is not held here.  Calls
 * can be made to selrecord() and we do not synchronize against those
 * directly using locks.  However, we use `sel_lwp' to lock out changes.
 * Before clearing it we must use memory barriers to ensure that we can
 * safely traverse the list of selinfo records.
 */
static void
selclear(void)
{
	struct selinfo *sip, *next;
	selcluster_t *sc;
	lwp_t *l;
	kmutex_t *lock;

	l = curlwp;
	sc = l->l_selcluster;
	lock = sc->sc_lock;

	mutex_spin_enter(lock);
	for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
		KASSERT(sip->sel_lwp == l);
		KASSERT(sip->sel_cluster == l->l_selcluster);

		/*
		 * Read link to next selinfo record, if any.
		 * It's no longer safe to touch `sip' after clearing
		 * `sel_lwp', so ensure that the read of `sel_chain'
		 * completes before the clearing of sel_lwp becomes
		 * globally visible.
		 */
		next = SLIST_NEXT(sip, sel_chain);
		membar_exit();
		/* Release the record for another named waiter to use. */
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}

/*
 * Initialize the select/poll system calls.  Called once for each
 * CPU in the system, as they are attached.
 */
void
selsysinit(struct cpu_info *ci)
{
	selcluster_t *sc;
	u_int index;

	/* If a cluster is already in place for this bit, re-use it. */
	index = cpu_index(ci) & SELCLUSTERMASK;
	sc = selcluster[index];
	if (sc == NULL) {
		sc = kmem_alloc(roundup2(sizeof(selcluster_t),
		    coherency_unit) + coherency_unit, KM_SLEEP);
		sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
		sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		sleepq_init(&sc->sc_sleepq);
		sc->sc_ncoll = 0;
		sc->sc_mask = (1 << index);
		selcluster[index] = sc;
	}
	ci->ci_data.cpu_selcluster = sc;
}

/*
 * Initialize a selinfo record.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
}

/*
 * Destroy a selinfo record.  The owning object must not gain new
 * references while this is in progress: all activity on the record
 * must be stopped.
 *
 * Concurrency issues: we only need to guard against a call to selclear()
 * by a thread exiting sel_do_scan().  The caller has prevented further
 * references being made to the selinfo record via selrecord(), and it
 * will not call selnotify() again.
 */
void
seldestroy(struct selinfo *sip)
{
	selcluster_t *sc;
	kmutex_t *lock;
	lwp_t *l;

	if (sip->sel_lwp == NULL)
		return;

	/*
	 * Lock out selclear().  The selcluster pointer can't change while
	 * we are here since it is only ever changed in selrecord(),
	 * and that will not be entered again for this record because
	 * it is dying.
	 */
	KASSERT(sip->sel_cluster != NULL);
	sc = sip->sel_cluster;
	lock = sc->sc_lock;
	mutex_spin_enter(lock);
	if ((l = sip->sel_lwp) != NULL) {
		/*
		 * This should rarely happen, so although SLIST_REMOVE()
		 * is slow, using it here is not a problem.
		 */
		KASSERT(l->l_selcluster == sc);
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}
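
/*
 * A hypothetical owner typically pairs selinit() and seldestroy() around
 * the object's lifetime (softc and routine names here are illustrative
 * only):
 *
 *	attach:	selinit(&sc->sc_selq);
 *	detach:	quiesce the object so no further selrecord()/selnotify()
 *		calls can occur, then seldestroy(&sc->sc_selq);
 */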

/*
 * System control nodes.
 */
SYSCTL_SETUP(sysctl_select_setup, "sysctl select setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "direct_select",
	    SYSCTL_DESCR("Enable/disable direct select (for testing)"),
	    NULL, 0, &direct_select, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);
}