| 1 | /* $NetBSD: tcp_input.c,v 1.349 2016/11/15 22:23:09 mrg Exp $ */ |
| 2 | |
| 3 | /* |
| 4 | * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. |
| 5 | * All rights reserved. |
| 6 | * |
| 7 | * Redistribution and use in source and binary forms, with or without |
| 8 | * modification, are permitted provided that the following conditions |
| 9 | * are met: |
| 10 | * 1. Redistributions of source code must retain the above copyright |
| 11 | * notice, this list of conditions and the following disclaimer. |
| 12 | * 2. Redistributions in binary form must reproduce the above copyright |
| 13 | * notice, this list of conditions and the following disclaimer in the |
| 14 | * documentation and/or other materials provided with the distribution. |
| 15 | * 3. Neither the name of the project nor the names of its contributors |
| 16 | * may be used to endorse or promote products derived from this software |
| 17 | * without specific prior written permission. |
| 18 | * |
| 19 | * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND |
| 20 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE |
| 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 25 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 26 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 27 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 28 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 29 | * SUCH DAMAGE. |
| 30 | */ |
| 31 | |
| 32 | /* |
| 33 | * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 |
| 34 | * |
| 35 | * NRL grants permission for redistribution and use in source and binary |
| 36 | * forms, with or without modification, of the software and documentation |
| 37 | * created at NRL provided that the following conditions are met: |
| 38 | * |
| 39 | * 1. Redistributions of source code must retain the above copyright |
| 40 | * notice, this list of conditions and the following disclaimer. |
| 41 | * 2. Redistributions in binary form must reproduce the above copyright |
| 42 | * notice, this list of conditions and the following disclaimer in the |
| 43 | * documentation and/or other materials provided with the distribution. |
| 44 | * 3. All advertising materials mentioning features or use of this software |
| 45 | * must display the following acknowledgements: |
| 46 | * This product includes software developed by the University of |
| 47 | * California, Berkeley and its contributors. |
| 48 | * This product includes software developed at the Information |
| 49 | * Technology Division, US Naval Research Laboratory. |
| 50 | * 4. Neither the name of the NRL nor the names of its contributors |
| 51 | * may be used to endorse or promote products derived from this software |
| 52 | * without specific prior written permission. |
| 53 | * |
| 54 | * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS |
| 55 | * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
| 56 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
| 57 | * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR |
| 58 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 59 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 60 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 61 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 62 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 63 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 64 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 65 | * |
| 66 | * The views and conclusions contained in the software and documentation |
| 67 | * are those of the authors and should not be interpreted as representing |
| 68 | * official policies, either expressed or implied, of the US Naval |
| 69 | * Research Laboratory (NRL). |
| 70 | */ |
| 71 | |
| 72 | /*- |
| 73 | * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006, |
| 74 | * 2011 The NetBSD Foundation, Inc. |
| 75 | * All rights reserved. |
| 76 | * |
| 77 | * This code is derived from software contributed to The NetBSD Foundation |
| 78 | * by Coyote Point Systems, Inc. |
| 79 | * This code is derived from software contributed to The NetBSD Foundation |
| 80 | * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation |
| 81 | * Facility, NASA Ames Research Center. |
| 82 | * This code is derived from software contributed to The NetBSD Foundation |
| 83 | * by Charles M. Hannum. |
| 84 | * This code is derived from software contributed to The NetBSD Foundation |
| 85 | * by Rui Paulo. |
| 86 | * |
| 87 | * Redistribution and use in source and binary forms, with or without |
| 88 | * modification, are permitted provided that the following conditions |
| 89 | * are met: |
| 90 | * 1. Redistributions of source code must retain the above copyright |
| 91 | * notice, this list of conditions and the following disclaimer. |
| 92 | * 2. Redistributions in binary form must reproduce the above copyright |
| 93 | * notice, this list of conditions and the following disclaimer in the |
| 94 | * documentation and/or other materials provided with the distribution. |
| 95 | * |
| 96 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
| 97 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
| 98 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| 99 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
| 100 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 101 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 102 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 103 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 104 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 105 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 106 | * POSSIBILITY OF SUCH DAMAGE. |
| 107 | */ |
| 108 | |
| 109 | /* |
| 110 | * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 |
| 111 | * The Regents of the University of California. All rights reserved. |
| 112 | * |
| 113 | * Redistribution and use in source and binary forms, with or without |
| 114 | * modification, are permitted provided that the following conditions |
| 115 | * are met: |
| 116 | * 1. Redistributions of source code must retain the above copyright |
| 117 | * notice, this list of conditions and the following disclaimer. |
| 118 | * 2. Redistributions in binary form must reproduce the above copyright |
| 119 | * notice, this list of conditions and the following disclaimer in the |
| 120 | * documentation and/or other materials provided with the distribution. |
| 121 | * 3. Neither the name of the University nor the names of its contributors |
| 122 | * may be used to endorse or promote products derived from this software |
| 123 | * without specific prior written permission. |
| 124 | * |
| 125 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 126 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 127 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 128 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 129 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 130 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 131 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 132 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 133 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 134 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 135 | * SUCH DAMAGE. |
| 136 | * |
| 137 | * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 |
| 138 | */ |
| 139 | |
| 140 | /* |
| 141 | * TODO list for SYN cache stuff: |
| 142 | * |
| 143 | * Find room for a "state" field, which is needed to keep a |
| 144 | * compressed state for TIME_WAIT TCBs. It's been noted already |
| 145 | * that this is fairly important for very high-volume web and |
| 146 | * mail servers, which use a large number of short-lived |
| 147 | * connections. |
| 148 | */ |
| 149 | |
| 150 | #include <sys/cdefs.h> |
| 151 | __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.349 2016/11/15 22:23:09 mrg Exp $" ); |
| 152 | |
| 153 | #ifdef _KERNEL_OPT |
| 154 | #include "opt_inet.h" |
| 155 | #include "opt_ipsec.h" |
| 156 | #include "opt_inet_csum.h" |
| 157 | #include "opt_tcp_debug.h" |
| 158 | #endif |
| 159 | |
| 160 | #include <sys/param.h> |
| 161 | #include <sys/systm.h> |
| 162 | #include <sys/malloc.h> |
| 163 | #include <sys/mbuf.h> |
| 164 | #include <sys/protosw.h> |
| 165 | #include <sys/socket.h> |
| 166 | #include <sys/socketvar.h> |
| 167 | #include <sys/errno.h> |
| 168 | #include <sys/syslog.h> |
| 169 | #include <sys/pool.h> |
| 170 | #include <sys/domain.h> |
| 171 | #include <sys/kernel.h> |
| 172 | #ifdef TCP_SIGNATURE |
| 173 | #include <sys/md5.h> |
| 174 | #endif |
| 175 | #include <sys/lwp.h> /* for lwp0 */ |
| 176 | #include <sys/cprng.h> |
| 177 | |
| 178 | #include <net/if.h> |
| 179 | #include <net/if_types.h> |
| 180 | |
| 181 | #include <netinet/in.h> |
| 182 | #include <netinet/in_systm.h> |
| 183 | #include <netinet/ip.h> |
| 184 | #include <netinet/in_pcb.h> |
| 185 | #include <netinet/in_var.h> |
| 186 | #include <netinet/ip_var.h> |
| 187 | #include <netinet/in_offload.h> |
| 188 | |
| 189 | #ifdef INET6 |
| 190 | #ifndef INET |
| 191 | #include <netinet/in.h> |
| 192 | #endif |
| 193 | #include <netinet/ip6.h> |
| 194 | #include <netinet6/ip6_var.h> |
| 195 | #include <netinet6/in6_pcb.h> |
| 196 | #include <netinet6/ip6_var.h> |
| 197 | #include <netinet6/in6_var.h> |
| 198 | #include <netinet/icmp6.h> |
| 199 | #include <netinet6/nd6.h> |
| 200 | #ifdef TCP_SIGNATURE |
| 201 | #include <netinet6/scope6_var.h> |
| 202 | #endif |
| 203 | #endif |
| 204 | |
| 205 | #ifndef INET6 |
| 206 | /* always need ip6.h for IP6_EXTHDR_GET */ |
| 207 | #include <netinet/ip6.h> |
| 208 | #endif |
| 209 | |
| 210 | #include <netinet/tcp.h> |
| 211 | #include <netinet/tcp_fsm.h> |
| 212 | #include <netinet/tcp_seq.h> |
| 213 | #include <netinet/tcp_timer.h> |
| 214 | #include <netinet/tcp_var.h> |
| 215 | #include <netinet/tcp_private.h> |
| 216 | #include <netinet/tcpip.h> |
| 217 | #include <netinet/tcp_congctl.h> |
| 218 | #include <netinet/tcp_debug.h> |
| 219 | |
| 220 | #ifdef INET6 |
| 221 | #include "faith.h" |
| 222 | #if defined(NFAITH) && NFAITH > 0 |
| 223 | #include <net/if_faith.h> |
| 224 | #endif |
| 225 | #endif /* INET6 */ |
| 226 | |
| 227 | #ifdef IPSEC |
| 228 | #include <netipsec/ipsec.h> |
| 229 | #include <netipsec/ipsec_var.h> |
| 230 | #include <netipsec/ipsec_private.h> |
| 231 | #include <netipsec/key.h> |
| 232 | #ifdef INET6 |
| 233 | #include <netipsec/ipsec6.h> |
| 234 | #endif |
| 235 | #endif /* IPSEC*/ |
| 236 | |
| 237 | #include <netinet/tcp_vtw.h> |
| 238 | |
/*
 * File-scope tunables and state for TCP input processing.
 */

/*
 * Retransmit threshold, initialised to 3.
 * NOTE(review): presumably the duplicate-ACK count that triggers fast
 * retransmit -- its users are outside this part of the file; confirm.
 */
int	tcprexmtthresh = 3;
/*
 * NOTE(review): presumably enables logging of refused connection
 * attempts (see tcp4_log_refused()/tcp6_log_refused()) -- confirm.
 */
int	tcp_log_refused;

/*
 * Receive-buffer auto-sizing knobs: an enable flag, an increment of
 * 16 KiB and a ceiling of 256 KiB.
 * NOTE(review): exact semantics are defined by the users of these
 * variables, which are not visible here -- confirm.
 */
int	tcp_do_autorcvbuf = 1;
int	tcp_autorcvbuf_inc = 16 * 1024;
int	tcp_autorcvbuf_max = 256 * 1024;
/* TCPTV_MSL divided by PR_SLOWHZ. */
int	tcp_msl = (TCPTV_MSL / PR_SLOWHZ);

/*
 * Counter/timestamp pairs.
 * NOTE(review): judging by the names these back packets-per-second
 * limiting of RSTs and of dropped ACKs -- confirm against their users.
 */
static int tcp_rst_ppslim_count = 0;
static struct timeval tcp_rst_ppslim_last;
static int tcp_ackdrop_ppslim_count = 0;
static struct timeval tcp_ackdrop_ppslim_last;

/*
 * 24 * 24 * 60 * 60 is the number of seconds in 24 days; the product is
 * scaled by PR_SLOWHZ.
 * NOTE(review): presumably the idle time, in slow-timer ticks, after
 * which PAWS timestamp state is considered stale -- confirm.
 */
#define TCP_PAWS_IDLE	(24U * 24 * 60 * 60 * PR_SLOWHZ)

/*
 * For modulo comparisons of timestamps: the difference is converted to
 * int and only its sign is examined, so the result stays meaningful
 * when the timestamp values wrap around.
 */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)
| 257 | |
/*
 * Upper-layer hint for IPv6 Neighbor Discovery / Neighbor Unreachability
 * Detection.
 *
 * With INET6, when tp is an AF_INET6 connection that has an IPv6 PCB and
 * its cached route validates, the route is passed to nd6_nud_hint().
 * Without INET6 the function is an empty stub so callers need no #ifdef.
 */
#ifdef INET6
static inline void
nd6_hint(struct tcpcb *tp)
{
	struct rtentry *route;

	/* Nothing to do unless this is an IPv6 connection with a PCB. */
	if (tp == NULL || tp->t_in6pcb == NULL || tp->t_family != AF_INET6)
		return;

	route = rtcache_validate(&tp->t_in6pcb->in6p_route);
	if (route != NULL)
		nd6_nud_hint(route);
}
#else
static inline void
nd6_hint(struct tcpcb *tp)
{
	/* IPv6 is not compiled in: intentionally empty. */
}
#endif
| 277 | |
| 278 | /* |
| 279 | * Compute ACK transmission behavior. Delay the ACK unless |
| 280 | * we have already delayed an ACK (must send an ACK every two segments). |
| 281 | * We also ACK immediately if we received a PUSH and the ACK-on-PUSH |
| 282 | * option is enabled. |
| 283 | */ |
| 284 | static void |
| 285 | tcp_setup_ack(struct tcpcb *tp, const struct tcphdr *th) |
| 286 | { |
| 287 | |
| 288 | if (tp->t_flags & TF_DELACK || |
| 289 | (tcp_ack_on_push && th->th_flags & TH_PUSH)) |
| 290 | tp->t_flags |= TF_ACKNOW; |
| 291 | else |
| 292 | TCP_SET_DELACK(tp); |
| 293 | } |
| 294 | |
| 295 | static void |
| 296 | icmp_check(struct tcpcb *tp, const struct tcphdr *th, int acked) |
| 297 | { |
| 298 | |
| 299 | /* |
| 300 | * If we had a pending ICMP message that refers to data that have |
| 301 | * just been acknowledged, disregard the recorded ICMP message. |
| 302 | */ |
| 303 | if ((tp->t_flags & TF_PMTUD_PEND) && |
| 304 | SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) |
| 305 | tp->t_flags &= ~TF_PMTUD_PEND; |
| 306 | |
| 307 | /* |
| 308 | * Keep track of the largest chunk of data |
| 309 | * acknowledged since last PMTU update |
| 310 | */ |
| 311 | if (tp->t_pmtud_mss_acked < acked) |
| 312 | tp->t_pmtud_mss_acked = acked; |
| 313 | } |
| 314 | |
| 315 | /* |
| 316 | * Convert TCP protocol fields to host order for easier processing. |
| 317 | */ |
| 318 | static void |
| 319 | tcp_fields_to_host(struct tcphdr *th) |
| 320 | { |
| 321 | |
| 322 | NTOHL(th->th_seq); |
| 323 | NTOHL(th->th_ack); |
| 324 | NTOHS(th->th_win); |
| 325 | NTOHS(th->th_urp); |
| 326 | } |
| 327 | |
| 328 | /* |
| 329 | * ... and reverse the above. |
| 330 | */ |
| 331 | static void |
| 332 | tcp_fields_to_net(struct tcphdr *th) |
| 333 | { |
| 334 | |
| 335 | HTONL(th->th_seq); |
| 336 | HTONL(th->th_ack); |
| 337 | HTONS(th->th_win); |
| 338 | HTONS(th->th_urp); |
| 339 | } |
| 340 | |
/*
 * Optional checksum event counters.
 *
 * With TCP_CSUM_COUNTERS defined, the evcnt objects below (defined
 * elsewhere) are declared and TCP_CSUM_COUNTER_INCR(ev) increments the
 * ev_count member of the evcnt that ev points to.  Without it the macro
 * expands to nothing, so call sites need no conditional compilation.
 */
#ifdef TCP_CSUM_COUNTERS
#include <sys/device.h>

#if defined(INET)
/* IPv4 counters. */
extern struct evcnt tcp_hwcsum_ok;
extern struct evcnt tcp_hwcsum_bad;
extern struct evcnt tcp_hwcsum_data;
extern struct evcnt tcp_swcsum;
#endif /* defined(INET) */
#if defined(INET6)
/* IPv6 counters. */
extern struct evcnt tcp6_hwcsum_ok;
extern struct evcnt tcp6_hwcsum_bad;
extern struct evcnt tcp6_hwcsum_data;
extern struct evcnt tcp6_swcsum;
#endif /* defined(INET6) */

/* ev is a pointer to a struct evcnt. */
#define	TCP_CSUM_COUNTER_INCR(ev)	(ev)->ev_count++

#else

#define	TCP_CSUM_COUNTER_INCR(ev)	/* nothing */

#endif /* TCP_CSUM_COUNTERS */
| 364 | |
/*
 * Optional reassembly event counters used by tcp_reass().
 *
 * With TCP_REASS_COUNTERS defined, the evcnt objects below (defined
 * elsewhere) are declared and TCP_REASS_COUNTER_INCR(ev) increments the
 * ev_count member of the evcnt that ev points to.  Without it the macro
 * expands to nothing.
 */
#ifdef TCP_REASS_COUNTERS
#include <sys/device.h>

extern struct evcnt tcp_reass_;
extern struct evcnt tcp_reass_empty;
/*
 * Indexed by the number of queue entries tcp_reass() visited in its
 * search loop: slots 1..7 for that many iterations, slot 0 for more
 * than 7.
 */
extern struct evcnt tcp_reass_iteration[8];
extern struct evcnt tcp_reass_prependfirst;
extern struct evcnt tcp_reass_prepend;
extern struct evcnt tcp_reass_insert;
extern struct evcnt tcp_reass_inserttail;
extern struct evcnt tcp_reass_append;
extern struct evcnt tcp_reass_appendtail;
extern struct evcnt tcp_reass_overlaptail;
extern struct evcnt tcp_reass_overlapfront;
extern struct evcnt tcp_reass_segdup;
extern struct evcnt tcp_reass_fragdup;

/* ev is a pointer to a struct evcnt. */
#define	TCP_REASS_COUNTER_INCR(ev)	(ev)->ev_count++

#else

#define	TCP_REASS_COUNTER_INCR(ev)	/* nothing */

#endif /* TCP_REASS_COUNTERS */
| 389 | |
/* Forward declarations for helpers local to this file. */

/* Segment reassembly; defined below. */
static int tcp_reass(struct tcpcb *, const struct tcphdr *, struct mbuf *,
    int *);
/* NOTE(review): presumably the TCP option parser; its definition is not in this part of the file. */
static int tcp_dooptions(struct tcpcb *, const u_char *, int,
    struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *);

/* Per-address-family loggers for connection attempts. */
#ifdef INET
static void tcp4_log_refused(const struct ip *, const struct tcphdr *);
#endif
#ifdef INET6
static void tcp6_log_refused(const struct ip6_hdr *, const struct tcphdr *);
#endif
| 401 | |
/*
 * Advance the pointer lvalue x along its m_next links until it refers to
 * the last element of the chain.  x is evaluated more than once and is
 * modified; it must not be NULL on entry.
 */
#define	TRAVERSE(x)	while ((x)->m_next != NULL) (x) = (x)->m_next
| 403 | |
#if defined(MBUFTRACE)
/* mbuf owner record that tcp_reass() assigns to queued segments via m_claimm(). */
struct mowner tcp_reass_mowner = MOWNER_INIT("tcp", "reass");
#endif /* defined(MBUFTRACE) */

/*
 * Pool of struct ipqent reassembly-queue entries; set up by
 * tcpipqent_init() and used through tcpipqent_alloc()/tcpipqent_free().
 */
static struct pool tcpipqent_pool;
| 409 | |
| 410 | void |
| 411 | tcpipqent_init(void) |
| 412 | { |
| 413 | |
| 414 | pool_init(&tcpipqent_pool, sizeof(struct ipqent), 0, 0, 0, "tcpipqepl" , |
| 415 | NULL, IPL_VM); |
| 416 | } |
| 417 | |
| 418 | struct ipqent * |
| 419 | tcpipqent_alloc(void) |
| 420 | { |
| 421 | struct ipqent *ipqe; |
| 422 | int s; |
| 423 | |
| 424 | s = splvm(); |
| 425 | ipqe = pool_get(&tcpipqent_pool, PR_NOWAIT); |
| 426 | splx(s); |
| 427 | |
| 428 | return ipqe; |
| 429 | } |
| 430 | |
| 431 | void |
| 432 | tcpipqent_free(struct ipqent *ipqe) |
| 433 | { |
| 434 | int s; |
| 435 | |
| 436 | s = splvm(); |
| 437 | pool_put(&tcpipqent_pool, ipqe); |
| 438 | splx(s); |
| 439 | } |
| 440 | |
/*
 * tcp_reass: queue a received segment for reassembly and, when possible,
 * hand the data at the front of the queue to the socket.
 *
 * tp   - connection; its segq (kept in sequence order by the insertion
 *        logic below), timeq (most recently touched entry at the head)
 *        and t_segqlen are updated.
 * th   - header of the received segment; th_seq and th_flags are copied
 *        into locals up front.  Pass NULL (0) to skip insertion and only
 *        attempt delivery of already-queued data.
 * m    - mbuf chain holding the segment's data.  Ownership passes to this
 *        function: the chain is either linked into a queue entry or
 *        freed with m_freem().  Not used when th is NULL.
 * tlen - points to the length of the segment's data; only read here.
 *        Not used when th is NULL.
 *
 * Returns the TH_FIN bit of the flags recorded on the queue entry that
 * was delivered to the socket, or 0 when nothing was delivered (queue
 * head not at rcv_nxt, connection not yet established, duplicate
 * segment, or no memory for a queue entry).  At most one queue entry is
 * delivered per call.
 *
 * TCP_REASS_LOCK_CHECK(tp) is evaluated on entry and TCP_REASS_UNLOCK(tp)
 * on every return path.
 * NOTE(review): this suggests the caller must hold the reassembly lock
 * on entry and that it is released here -- confirm against the macro
 * definitions, which are not in this file.
 */
static int
tcp_reass(struct tcpcb *tp, const struct tcphdr *th, struct mbuf *m, int *tlen)
{
	/*
	 * p    - queue entry after which the new entry is inserted
	 *        (NULL means insert at the head).
	 * q/nq - current and next entry while scanning segq.
	 * tiqe - queue entry that will carry the (possibly merged) segment:
	 *        either an existing entry being extended or recycled, or
	 *        one freshly allocated at insert_it.
	 */
	struct ipqent *p, *q, *nq, *tiqe = NULL;
	struct socket *so = NULL;
	int pkt_flags;
	tcp_seq pkt_seq;
	unsigned pkt_len;
	u_long rcvpartdupbyte = 0;	/* bytes found to overlap queued data */
	u_long rcvoobyte;		/* bytes counted as out-of-order */
#ifdef TCP_REASS_COUNTERS
	u_int count = 0;		/* entries visited by the scan loop */
#endif
	uint64_t *tcps;

	/* Locate the socket through whichever PCB this connection has. */
	if (tp->t_inpcb)
		so = tp->t_inpcb->inp_socket;
#ifdef INET6
	else if (tp->t_in6pcb)
		so = tp->t_in6pcb->in6p_socket;
#endif

	TCP_REASS_LOCK_CHECK(tp);

	/*
	 * Call with th==0 after becoming established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	m_claimm(m, &tcp_reass_mowner);

	rcvoobyte = *tlen;
	/*
	 * Copy these to local variables because the tcpiphdr
	 * gets munged while we are collapsing mbufs.
	 */
	pkt_seq = th->th_seq;
	pkt_len = *tlen;
	pkt_flags = th->th_flags;

	TCP_REASS_COUNTER_INCR(&tcp_reass_);

	if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) {
		/*
		 * When we miss a packet, the vast majority of time we get
		 * packets that follow it in order.  So optimize for that.
		 *
		 * The segment is appended to the last entry in place.  Note
		 * that this path jumps to skip_replacement and therefore
		 * bypasses the statistics update done after insert_it.
		 */
		if (pkt_seq == p->ipqe_seq + p->ipqe_len) {
			p->ipqe_len += pkt_len;
			p->ipqe_flags |= pkt_flags;
			m_cat(p->ipre_mlast, m);
			TRAVERSE(p->ipre_mlast);
			m = NULL;
			tiqe = p;
			/* Removed here, re-added at the head of timeq below. */
			TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq);
			TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail);
			goto skip_replacement;
		}
		/*
		 * While we're here, if the pkt is completely beyond
		 * anything we have, just insert it at the tail.
		 * (p still points at the last entry, so insert_it places
		 * the new entry after it.)
		 */
		if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) {
			TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail);
			goto insert_it;
		}
	}

	q = TAILQ_FIRST(&tp->segq);

	if (q != NULL) {
		/*
		 * If this segment immediately precedes the first out-of-order
		 * block, simply slap the segment in front of it and (mostly)
		 * skip the complicated logic.
		 */
		if (pkt_seq + pkt_len == q->ipqe_seq) {
			q->ipqe_seq = pkt_seq;
			q->ipqe_len += pkt_len;
			q->ipqe_flags |= pkt_flags;
			m_cat(m, q->ipqe_m);
			q->ipqe_m = m;
			q->ipre_mlast = m; /* last mbuf may have changed */
			TRAVERSE(q->ipre_mlast);
			tiqe = q;
			/* Removed here, re-added at the head of timeq below. */
			TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
			TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst);
			goto skip_replacement;
		}
	} else {
		TCP_REASS_COUNTER_INCR(&tcp_reass_empty);
	}

	/*
	 * Find a segment which begins after this one does.
	 *
	 * While scanning, queued entries that touch or overlap the received
	 * segment are merged into it (pkt_seq/pkt_len/pkt_flags/m describe
	 * the merged result) and unlinked from both queues.
	 */
	for (p = NULL; q != NULL; q = nq) {
		/* Fetch the successor first: q may be unlinked below. */
		nq = TAILQ_NEXT(q, ipqe_q);
#ifdef TCP_REASS_COUNTERS
		count++;
#endif
		/*
		 * If the received segment is just right after this
		 * fragment, merge the two together and then check
		 * for further overlaps.
		 */
		if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n",
			    tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
			    q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len);
#endif
			pkt_len += q->ipqe_len;
			pkt_flags |= q->ipqe_flags;
			pkt_seq = q->ipqe_seq;
			m_cat(q->ipre_mlast, m);
			TRAVERSE(q->ipre_mlast);
			m = q->ipqe_m;
			TCP_REASS_COUNTER_INCR(&tcp_reass_append);
			goto free_ipqe;
		}
		/*
		 * If the received segment is completely past this
		 * fragment, we need to go the next fragment.
		 */
		if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
			p = q;
			continue;
		}
		/*
		 * If the fragment is past the received segment,
		 * it (or any following) can't be concatenated.
		 */
		if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) {
			TCP_REASS_COUNTER_INCR(&tcp_reass_insert);
			break;
		}

		/*
		 * We've received all the data in this segment before.
		 * mark it as a duplicate and return.
		 */
		if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
		    SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
			tcps = TCP_STAT_GETREF();
			tcps[TCP_STAT_RCVDUPPACK]++;
			tcps[TCP_STAT_RCVDUPBYTE] += pkt_len;
			TCP_STAT_PUTREF();
			tcp_new_dsack(tp, pkt_seq, pkt_len);
			m_freem(m);
			/* Release any entry unlinked earlier in this scan. */
			if (tiqe != NULL) {
				tcpipqent_free(tiqe);
			}
			TCP_REASS_COUNTER_INCR(&tcp_reass_segdup);
			goto out;
		}
		/*
		 * Received segment completely overlaps this fragment
		 * so we drop the fragment (this keeps the temporal
		 * ordering of segments correct).
		 */
		if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
		    SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
			rcvpartdupbyte += q->ipqe_len;
			m_freem(q->ipqe_m);
			TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup);
			goto free_ipqe;
		}
		/*
		 * RX'ed segment extends past the end of the
		 * fragment.  Drop the overlapping bytes.  Then
		 * merge the fragment and segment then treat as
		 * a longer received packet.
		 */
		if (SEQ_LT(q->ipqe_seq, pkt_seq) &&
		    SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
			int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n",
			    tp, overlap,
			    pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
			/* Positive count: trim from the front of the segment. */
			m_adj(m, overlap);
			rcvpartdupbyte += overlap;
			m_cat(q->ipre_mlast, m);
			TRAVERSE(q->ipre_mlast);
			m = q->ipqe_m;
			pkt_seq = q->ipqe_seq;
			pkt_len += q->ipqe_len - overlap;
			rcvoobyte -= overlap;
			TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail);
			goto free_ipqe;
		}
		/*
		 * RX'ed segment extends past the front of the
		 * fragment.  Drop the overlapping bytes on the
		 * received packet.  The packet will then be
		 * concatenated with this fragment a bit later.
		 */
		if (SEQ_GT(q->ipqe_seq, pkt_seq) &&
		    SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) {
			int overlap = pkt_seq + pkt_len - q->ipqe_seq;
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n",
			    tp, overlap,
			    pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
			/* Negative count: trim from the tail of the segment. */
			m_adj(m, -overlap);
			pkt_len -= overlap;
			rcvpartdupbyte += overlap;
			TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront);
			rcvoobyte -= overlap;
		}
		/*
		 * If the received segment immediately precedes this
		 * fragment then tack the fragment onto this segment
		 * and reinsert the data.
		 */
		if (q->ipqe_seq == pkt_seq + pkt_len) {
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n",
			    tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len,
			    pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
			pkt_len += q->ipqe_len;
			pkt_flags |= q->ipqe_flags;
			m_cat(m, q->ipqe_m);
			TAILQ_REMOVE(&tp->segq, q, ipqe_q);
			TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
			tp->t_segqlen--;
			KASSERT(tp->t_segqlen >= 0);
			KASSERT(tp->t_segqlen != 0 ||
			    (TAILQ_EMPTY(&tp->segq) &&
			    TAILQ_EMPTY(&tp->timeq)));
			/* Keep one unlinked entry for reuse, free the rest. */
			if (tiqe == NULL) {
				tiqe = q;
			} else {
				tcpipqent_free(q);
			}
			TCP_REASS_COUNTER_INCR(&tcp_reass_prepend);
			break;
		}
		/*
		 * If the fragment is before the segment, remember it.
		 * When this loop is terminated, p will contain the
		 * pointer to fragment that is right before the received
		 * segment.
		 */
		if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
			p = q;

		continue;

		/*
		 * This is a common operation.  It also will allow
		 * to save doing a malloc/free in most instances.
		 *
		 * Reached only by goto from the merge cases above: unlink q
		 * from both queues and keep it as the entry to reuse, or
		 * free it if one is already being kept.
		 */
	  free_ipqe:
		TAILQ_REMOVE(&tp->segq, q, ipqe_q);
		TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
		tp->t_segqlen--;
		KASSERT(tp->t_segqlen >= 0);
		KASSERT(tp->t_segqlen != 0 ||
		    (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
		if (tiqe == NULL) {
			tiqe = q;
		} else {
			tcpipqent_free(q);
		}
	}

#ifdef TCP_REASS_COUNTERS
	/* Slot 0 collects every scan that visited more than 7 entries. */
	if (count > 7)
		TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[0]);
	else if (count > 0)
		TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]);
#endif

 insert_it:

	/*
	 * Allocate a new queue entry since the received segment did not
	 * collapse onto any other out-of-order block; thus we are allocating
	 * a new block.  If it had collapsed, tiqe would not be NULL and
	 * we would be reusing it.
	 * XXX If we can't, just drop the packet.  XXX
	 */
	if (tiqe == NULL) {
		tiqe = tcpipqent_alloc();
		if (tiqe == NULL) {
			TCP_STATINC(TCP_STAT_RCVMEMDROP);
			m_freem(m);
			goto out;
		}
	}

	/*
	 * Update the counters.
	 */
	tp->t_rcvoopack++;
	tcps = TCP_STAT_GETREF();
	tcps[TCP_STAT_RCVOOPACK]++;
	tcps[TCP_STAT_RCVOOBYTE] += rcvoobyte;
	if (rcvpartdupbyte) {
	    tcps[TCP_STAT_RCVPARTDUPPACK]++;
	    tcps[TCP_STAT_RCVPARTDUPBYTE] += rcvpartdupbyte;
	}
	TCP_STAT_PUTREF();

	/*
	 * Insert the new fragment queue entry into both queues.
	 *
	 * ipre_mlast is set to the head of the chain here without walking
	 * it, so it is not necessarily the true last mbuf when m is a
	 * multi-mbuf chain.
	 * NOTE(review): later appends pass it to m_cat(), which presumably
	 * walks to the end of the chain itself -- confirm.
	 */
	tiqe->ipqe_m = m;
	tiqe->ipre_mlast = m;
	tiqe->ipqe_seq = pkt_seq;
	tiqe->ipqe_len = pkt_len;
	tiqe->ipqe_flags = pkt_flags;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
#ifdef TCPREASS_DEBUG
		if (tiqe->ipqe_seq != tp->rcv_nxt)
			printf("tcp_reass[%p]: insert %u:%u(%u) at front\n",
			    tp, pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
	} else {
		TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q);
#ifdef TCPREASS_DEBUG
		printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n",
		    tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
		    p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len);
#endif
	}
	tp->t_segqlen++;

 skip_replacement:

	/* The entry just created or extended goes to the head of timeq. */
	TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);

 present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		goto out;
	q = TAILQ_FIRST(&tp->segq);
	/* Deliver only when the first queued entry starts exactly at rcv_nxt. */
	if (q == NULL || q->ipqe_seq != tp->rcv_nxt)
		goto out;
	/* In SYN_RECEIVED, an entry that carries data is left queued. */
	if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
		goto out;

	tp->rcv_nxt += q->ipqe_len;
	/* Only the FIN bit of the entry's flags is reported to the caller. */
	pkt_flags = q->ipqe_flags & TH_FIN;
	nd6_hint(tp);

	TAILQ_REMOVE(&tp->segq, q, ipqe_q);
	TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
	tp->t_segqlen--;
	KASSERT(tp->t_segqlen >= 0);
	KASSERT(tp->t_segqlen != 0 ||
	    (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
	/* Discard the data if the socket can no longer receive. */
	if (so->so_state & SS_CANTRCVMORE)
		m_freem(q->ipqe_m);
	else
		sbappendstream(&so->so_rcv, q->ipqe_m);
	tcpipqent_free(q);
	TCP_REASS_UNLOCK(tp);
	sorwakeup(so);
	return (pkt_flags);
out:
	TCP_REASS_UNLOCK(tp);
	return (0);
}
| 816 | |
#ifdef INET6
/*
 * IPv6 entry point for TCP input.
 *
 * mp:    points to the received mbuf chain.
 * offp:  points to an offset that is passed through to tcp_input().
 * proto: protocol number, passed through to tcp_input().
 *
 * A packet flagged M_ANYCAST6 is not handed to TCP; it is answered with
 * an ICMPv6 "destination unreachable, address" error whose pointer
 * argument is the offset of ip6_dst within the IPv6 header
 * (draft-itojun-ipv6-tcp-to-anycast).  Everything else goes to
 * tcp_input().  Always returns IPPROTO_DONE.
 */
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *pkt = *mp;
	struct ip6_hdr *hdr;

	/* Common case: not addressed to an anycast address. */
	if ((pkt->m_flags & M_ANYCAST6) == 0) {
		tcp_input(pkt, *offp, proto);
		return IPPROTO_DONE;
	}

	/*
	 * Anycast destination.  XXX is there a better place for this check?
	 * Make sure the IPv6 header is contiguous before looking at it.
	 */
	if (pkt->m_len < sizeof(struct ip6_hdr)) {
		pkt = m_pullup(pkt, sizeof(struct ip6_hdr));
		if (pkt == NULL) {
			TCP_STATINC(TCP_STAT_RCVSHORT);
			return IPPROTO_DONE;
		}
	}

	hdr = mtod(pkt, struct ip6_hdr *);
	icmp6_error(pkt, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
	    (char *)&hdr->ip6_dst - (char *)hdr);
	return IPPROTO_DONE;
}
#endif
| 845 | |
#ifdef INET
/*
 * Log, at LOG_INFO, a connection attempt described by an IPv4 header and
 * a TCP header.
 *
 * ip: IPv4 header supplying the addresses; may be NULL, in which case
 *     both addresses are printed as "(unknown)".
 * th: TCP header supplying the ports, which are converted with ntohs().
 */
static void
tcp4_log_refused(const struct ip *ip, const struct tcphdr *th)
{
	char from[INET_ADDRSTRLEN];
	char to[INET_ADDRSTRLEN];

	if (ip == NULL) {
		strlcpy(from, "(unknown)", sizeof(from));
		strlcpy(to, "(unknown)", sizeof(to));
	} else {
		in_print(from, sizeof(from), &ip->ip_src);
		in_print(to, sizeof(to), &ip->ip_dst);
	}

	log(LOG_INFO, "Connection attempt to TCP %s:%d from %s:%d\n",
	    to, ntohs(th->th_dport), from, ntohs(th->th_sport));
}
#endif
| 867 | |
#ifdef INET6
/*
 * Log (at LOG_INFO) a connection attempt seen on IPv6.
 *
 * ip6  IPv6 header supplying the addresses; may be NULL, in which case
 *      both addresses are printed as "(unknown v6)".
 * th   TCP header supplying the ports (converted with ntohs() here).
 */
static void
tcp6_log_refused(const struct ip6_hdr *ip6, const struct tcphdr *th)
{
	char to[INET6_ADDRSTRLEN];
	char from[INET6_ADDRSTRLEN];

	if (ip6 == NULL) {
		strlcpy(from, "(unknown v6)", sizeof(from));
		strlcpy(to, "(unknown v6)", sizeof(to));
	} else {
		in6_print(from, sizeof(from), &ip6->ip6_src);
		in6_print(to, sizeof(to), &ip6->ip6_dst);
	}

	/* Addresses are bracketed so the port separator stays unambiguous. */
	log(LOG_INFO, "Connection attempt to TCP [%s]:%d from [%s]:%d\n",
	    to, ntohs(th->th_dport), from, ntohs(th->th_sport));
}
#endif
| 889 | |
| 890 | /* |
| 891 | * Checksum extended TCP header and data. |
| 892 | */ |
| 893 | int |
| 894 | tcp_input_checksum(int af, struct mbuf *m, const struct tcphdr *th, |
| 895 | int toff, int off, int tlen) |
| 896 | { |
| 897 | struct ifnet *rcvif; |
| 898 | int s; |
| 899 | |
| 900 | /* |
| 901 | * XXX it's better to record and check if this mbuf is |
| 902 | * already checked. |
| 903 | */ |
| 904 | |
| 905 | rcvif = m_get_rcvif(m, &s); |
| 906 | |
| 907 | switch (af) { |
| 908 | #ifdef INET |
| 909 | case AF_INET: |
| 910 | switch (m->m_pkthdr.csum_flags & |
| 911 | ((rcvif->if_csum_flags_rx & M_CSUM_TCPv4) | |
| 912 | M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) { |
| 913 | case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD: |
| 914 | TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad); |
| 915 | goto badcsum; |
| 916 | |
| 917 | case M_CSUM_TCPv4|M_CSUM_DATA: { |
| 918 | u_int32_t hw_csum = m->m_pkthdr.csum_data; |
| 919 | |
| 920 | TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data); |
| 921 | if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) { |
| 922 | const struct ip *ip = |
| 923 | mtod(m, const struct ip *); |
| 924 | |
| 925 | hw_csum = in_cksum_phdr(ip->ip_src.s_addr, |
| 926 | ip->ip_dst.s_addr, |
| 927 | htons(hw_csum + tlen + off + IPPROTO_TCP)); |
| 928 | } |
| 929 | if ((hw_csum ^ 0xffff) != 0) |
| 930 | goto badcsum; |
| 931 | break; |
| 932 | } |
| 933 | |
| 934 | case M_CSUM_TCPv4: |
| 935 | /* Checksum was okay. */ |
| 936 | TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok); |
| 937 | break; |
| 938 | |
| 939 | default: |
| 940 | /* |
| 941 | * Must compute it ourselves. Maybe skip checksum |
| 942 | * on loopback interfaces. |
| 943 | */ |
| 944 | if (__predict_true(!(rcvif->if_flags & IFF_LOOPBACK) || |
| 945 | tcp_do_loopback_cksum)) { |
| 946 | TCP_CSUM_COUNTER_INCR(&tcp_swcsum); |
| 947 | if (in4_cksum(m, IPPROTO_TCP, toff, |
| 948 | tlen + off) != 0) |
| 949 | goto badcsum; |
| 950 | } |
| 951 | break; |
| 952 | } |
| 953 | break; |
| 954 | #endif /* INET4 */ |
| 955 | |
| 956 | #ifdef INET6 |
| 957 | case AF_INET6: |
| 958 | switch (m->m_pkthdr.csum_flags & |
| 959 | ((rcvif->if_csum_flags_rx & M_CSUM_TCPv6) | |
| 960 | M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) { |
| 961 | case M_CSUM_TCPv6|M_CSUM_TCP_UDP_BAD: |
| 962 | TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_bad); |
| 963 | goto badcsum; |
| 964 | |
| 965 | #if 0 /* notyet */ |
| 966 | case M_CSUM_TCPv6|M_CSUM_DATA: |
| 967 | #endif |
| 968 | |
| 969 | case M_CSUM_TCPv6: |
| 970 | /* Checksum was okay. */ |
| 971 | TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_ok); |
| 972 | break; |
| 973 | |
| 974 | default: |
| 975 | /* |
| 976 | * Must compute it ourselves. Maybe skip checksum |
| 977 | * on loopback interfaces. |
| 978 | */ |
| 979 | if (__predict_true((m->m_flags & M_LOOP) == 0 || |
| 980 | tcp_do_loopback_cksum)) { |
| 981 | TCP_CSUM_COUNTER_INCR(&tcp6_swcsum); |
| 982 | if (in6_cksum(m, IPPROTO_TCP, toff, |
| 983 | tlen + off) != 0) |
| 984 | goto badcsum; |
| 985 | } |
| 986 | } |
| 987 | break; |
| 988 | #endif /* INET6 */ |
| 989 | } |
| 990 | m_put_rcvif(rcvif, &s); |
| 991 | |
| 992 | return 0; |
| 993 | |
| 994 | badcsum: |
| 995 | m_put_rcvif(rcvif, &s); |
| 996 | TCP_STATINC(TCP_STAT_RCVBADSUM); |
| 997 | return -1; |
| 998 | } |
| 999 | |
| 1000 | /* When a packet arrives addressed to a vestigial tcpbp, we |
| 1001 | * nevertheless have to respond to it per the spec. |
| 1002 | */ |
| 1003 | static void tcp_vtw_input(struct tcphdr *th, vestigial_inpcb_t *vp, |
| 1004 | struct mbuf *m, int tlen, int multicast) |
| 1005 | { |
| 1006 | int tiflags; |
| 1007 | int todrop; |
| 1008 | uint32_t t_flags = 0; |
| 1009 | uint64_t *tcps; |
| 1010 | |
| 1011 | tiflags = th->th_flags; |
| 1012 | todrop = vp->rcv_nxt - th->th_seq; |
| 1013 | |
| 1014 | if (todrop > 0) { |
| 1015 | if (tiflags & TH_SYN) { |
| 1016 | tiflags &= ~TH_SYN; |
| 1017 | ++th->th_seq; |
| 1018 | if (th->th_urp > 1) |
| 1019 | --th->th_urp; |
| 1020 | else { |
| 1021 | tiflags &= ~TH_URG; |
| 1022 | th->th_urp = 0; |
| 1023 | } |
| 1024 | --todrop; |
| 1025 | } |
| 1026 | if (todrop > tlen || |
| 1027 | (todrop == tlen && (tiflags & TH_FIN) == 0)) { |
| 1028 | /* |
| 1029 | * Any valid FIN or RST must be to the left of the |
| 1030 | * window. At this point the FIN or RST must be a |
| 1031 | * duplicate or out of sequence; drop it. |
| 1032 | */ |
| 1033 | if (tiflags & TH_RST) |
| 1034 | goto drop; |
| 1035 | tiflags &= ~(TH_FIN|TH_RST); |
| 1036 | /* |
| 1037 | * Send an ACK to resynchronize and drop any data. |
| 1038 | * But keep on processing for RST or ACK. |
| 1039 | */ |
| 1040 | t_flags |= TF_ACKNOW; |
| 1041 | todrop = tlen; |
| 1042 | tcps = TCP_STAT_GETREF(); |
| 1043 | tcps[TCP_STAT_RCVDUPPACK] += 1; |
| 1044 | tcps[TCP_STAT_RCVDUPBYTE] += todrop; |
| 1045 | TCP_STAT_PUTREF(); |
| 1046 | } else if ((tiflags & TH_RST) |
| 1047 | && th->th_seq != vp->rcv_nxt) { |
| 1048 | /* |
| 1049 | * Test for reset before adjusting the sequence |
| 1050 | * number for overlapping data. |
| 1051 | */ |
| 1052 | goto dropafterack_ratelim; |
| 1053 | } else { |
| 1054 | tcps = TCP_STAT_GETREF(); |
| 1055 | tcps[TCP_STAT_RCVPARTDUPPACK] += 1; |
| 1056 | tcps[TCP_STAT_RCVPARTDUPBYTE] += todrop; |
| 1057 | TCP_STAT_PUTREF(); |
| 1058 | } |
| 1059 | |
| 1060 | // tcp_new_dsack(tp, th->th_seq, todrop); |
| 1061 | // hdroptlen += todrop; /*drop from head afterwards*/ |
| 1062 | |
| 1063 | th->th_seq += todrop; |
| 1064 | tlen -= todrop; |
| 1065 | |
| 1066 | if (th->th_urp > todrop) |
| 1067 | th->th_urp -= todrop; |
| 1068 | else { |
| 1069 | tiflags &= ~TH_URG; |
| 1070 | th->th_urp = 0; |
| 1071 | } |
| 1072 | } |
| 1073 | |
| 1074 | /* |
| 1075 | * If new data are received on a connection after the |
| 1076 | * user processes are gone, then RST the other end. |
| 1077 | */ |
| 1078 | if (tlen) { |
| 1079 | TCP_STATINC(TCP_STAT_RCVAFTERCLOSE); |
| 1080 | goto dropwithreset; |
| 1081 | } |
| 1082 | |
| 1083 | /* |
| 1084 | * If segment ends after window, drop trailing data |
| 1085 | * (and PUSH and FIN); if nothing left, just ACK. |
| 1086 | */ |
| 1087 | todrop = (th->th_seq + tlen) - (vp->rcv_nxt+vp->rcv_wnd); |
| 1088 | |
| 1089 | if (todrop > 0) { |
| 1090 | TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN); |
| 1091 | if (todrop >= tlen) { |
| 1092 | /* |
| 1093 | * The segment actually starts after the window. |
| 1094 | * th->th_seq + tlen - vp->rcv_nxt - vp->rcv_wnd >= tlen |
| 1095 | * th->th_seq - vp->rcv_nxt - vp->rcv_wnd >= 0 |
| 1096 | * th->th_seq >= vp->rcv_nxt + vp->rcv_wnd |
| 1097 | */ |
| 1098 | TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen); |
| 1099 | /* |
| 1100 | * If a new connection request is received |
| 1101 | * while in TIME_WAIT, drop the old connection |
| 1102 | * and start over if the sequence numbers |
| 1103 | * are above the previous ones. |
| 1104 | */ |
| 1105 | if ((tiflags & TH_SYN) |
| 1106 | && SEQ_GT(th->th_seq, vp->rcv_nxt)) { |
| 1107 | /* We only support this in the !NOFDREF case, which |
| 1108 | * is to say: not here. |
| 1109 | */ |
| 1110 | goto dropwithreset; |
| 1111 | } |
| 1112 | /* |
| 1113 | * If window is closed can only take segments at |
| 1114 | * window edge, and have to drop data and PUSH from |
| 1115 | * incoming segments. Continue processing, but |
| 1116 | * remember to ack. Otherwise, drop segment |
| 1117 | * and (if not RST) ack. |
| 1118 | */ |
| 1119 | if (vp->rcv_wnd == 0 && th->th_seq == vp->rcv_nxt) { |
| 1120 | t_flags |= TF_ACKNOW; |
| 1121 | TCP_STATINC(TCP_STAT_RCVWINPROBE); |
| 1122 | } else |
| 1123 | goto dropafterack; |
| 1124 | } else |
| 1125 | TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop); |
| 1126 | m_adj(m, -todrop); |
| 1127 | tlen -= todrop; |
| 1128 | tiflags &= ~(TH_PUSH|TH_FIN); |
| 1129 | } |
| 1130 | |
| 1131 | if (tiflags & TH_RST) { |
| 1132 | if (th->th_seq != vp->rcv_nxt) |
| 1133 | goto dropafterack_ratelim; |
| 1134 | |
| 1135 | vtw_del(vp->ctl, vp->vtw); |
| 1136 | goto drop; |
| 1137 | } |
| 1138 | |
| 1139 | /* |
| 1140 | * If the ACK bit is off we drop the segment and return. |
| 1141 | */ |
| 1142 | if ((tiflags & TH_ACK) == 0) { |
| 1143 | if (t_flags & TF_ACKNOW) |
| 1144 | goto dropafterack; |
| 1145 | else |
| 1146 | goto drop; |
| 1147 | } |
| 1148 | |
| 1149 | /* |
| 1150 | * In TIME_WAIT state the only thing that should arrive |
| 1151 | * is a retransmission of the remote FIN. Acknowledge |
| 1152 | * it and restart the finack timer. |
| 1153 | */ |
| 1154 | vtw_restart(vp); |
| 1155 | goto dropafterack; |
| 1156 | |
| 1157 | dropafterack: |
| 1158 | /* |
| 1159 | * Generate an ACK dropping incoming segment if it occupies |
| 1160 | * sequence space, where the ACK reflects our state. |
| 1161 | */ |
| 1162 | if (tiflags & TH_RST) |
| 1163 | goto drop; |
| 1164 | goto dropafterack2; |
| 1165 | |
| 1166 | dropafterack_ratelim: |
| 1167 | /* |
| 1168 | * We may want to rate-limit ACKs against SYN/RST attack. |
| 1169 | */ |
| 1170 | if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, |
| 1171 | tcp_ackdrop_ppslim) == 0) { |
| 1172 | /* XXX stat */ |
| 1173 | goto drop; |
| 1174 | } |
| 1175 | /* ...fall into dropafterack2... */ |
| 1176 | |
| 1177 | dropafterack2: |
| 1178 | (void)tcp_respond(0, m, m, th, th->th_seq + tlen, th->th_ack, |
| 1179 | TH_ACK); |
| 1180 | return; |
| 1181 | |
| 1182 | dropwithreset: |
| 1183 | /* |
| 1184 | * Generate a RST, dropping incoming segment. |
| 1185 | * Make ACK acceptable to originator of segment. |
| 1186 | */ |
| 1187 | if (tiflags & TH_RST) |
| 1188 | goto drop; |
| 1189 | |
| 1190 | if (tiflags & TH_ACK) |
| 1191 | tcp_respond(0, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); |
| 1192 | else { |
| 1193 | if (tiflags & TH_SYN) |
| 1194 | ++tlen; |
| 1195 | (void)tcp_respond(0, m, m, th, th->th_seq + tlen, (tcp_seq)0, |
| 1196 | TH_RST|TH_ACK); |
| 1197 | } |
| 1198 | return; |
| 1199 | drop: |
| 1200 | m_freem(m); |
| 1201 | } |
| 1202 | |
| 1203 | /* |
| 1204 | * TCP input routine, follows pages 65-76 of RFC 793 very closely. |
| 1205 | */ |
| 1206 | void |
| 1207 | tcp_input(struct mbuf *m, ...) |
| 1208 | { |
| 1209 | struct tcphdr *th; |
| 1210 | struct ip *ip; |
| 1211 | struct inpcb *inp; |
| 1212 | #ifdef INET6 |
| 1213 | struct ip6_hdr *ip6; |
| 1214 | struct in6pcb *in6p; |
| 1215 | #endif |
| 1216 | u_int8_t *optp = NULL; |
| 1217 | int optlen = 0; |
| 1218 | int len, tlen, toff, hdroptlen = 0; |
| 1219 | struct tcpcb *tp = 0; |
| 1220 | int tiflags; |
| 1221 | struct socket *so = NULL; |
| 1222 | int todrop, acked, ourfinisacked, needoutput = 0; |
| 1223 | bool dupseg; |
| 1224 | #ifdef TCP_DEBUG |
| 1225 | short ostate = 0; |
| 1226 | #endif |
| 1227 | u_long tiwin; |
| 1228 | struct tcp_opt_info opti; |
| 1229 | int off, iphlen; |
| 1230 | va_list ap; |
| 1231 | int af; /* af on the wire */ |
| 1232 | struct mbuf *tcp_saveti = NULL; |
| 1233 | uint32_t ts_rtt; |
| 1234 | uint8_t iptos; |
| 1235 | uint64_t *tcps; |
| 1236 | vestigial_inpcb_t vestige; |
| 1237 | |
| 1238 | vestige.valid = 0; |
| 1239 | |
| 1240 | MCLAIM(m, &tcp_rx_mowner); |
| 1241 | va_start(ap, m); |
| 1242 | toff = va_arg(ap, int); |
| 1243 | (void)va_arg(ap, int); /* ignore value, advance ap */ |
| 1244 | va_end(ap); |
| 1245 | |
| 1246 | TCP_STATINC(TCP_STAT_RCVTOTAL); |
| 1247 | |
| 1248 | memset(&opti, 0, sizeof(opti)); |
| 1249 | opti.ts_present = 0; |
| 1250 | opti.maxseg = 0; |
| 1251 | |
| 1252 | /* |
| 1253 | * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN. |
| 1254 | * |
| 1255 | * TCP is, by definition, unicast, so we reject all |
| 1256 | * multicast outright. |
| 1257 | * |
| 1258 | * Note, there are additional src/dst address checks in |
| 1259 | * the AF-specific code below. |
| 1260 | */ |
| 1261 | if (m->m_flags & (M_BCAST|M_MCAST)) { |
| 1262 | /* XXX stat */ |
| 1263 | goto drop; |
| 1264 | } |
| 1265 | #ifdef INET6 |
| 1266 | if (m->m_flags & M_ANYCAST6) { |
| 1267 | /* XXX stat */ |
| 1268 | goto drop; |
| 1269 | } |
| 1270 | #endif |
| 1271 | |
| 1272 | /* |
| 1273 | * Get IP and TCP header. |
| 1274 | * Note: IP leaves IP header in first mbuf. |
| 1275 | */ |
| 1276 | ip = mtod(m, struct ip *); |
| 1277 | switch (ip->ip_v) { |
| 1278 | #ifdef INET |
| 1279 | case 4: |
| 1280 | #ifdef INET6 |
| 1281 | ip6 = NULL; |
| 1282 | #endif |
| 1283 | af = AF_INET; |
| 1284 | iphlen = sizeof(struct ip); |
| 1285 | IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, |
| 1286 | sizeof(struct tcphdr)); |
| 1287 | if (th == NULL) { |
| 1288 | TCP_STATINC(TCP_STAT_RCVSHORT); |
| 1289 | return; |
| 1290 | } |
| 1291 | /* We do the checksum after PCB lookup... */ |
| 1292 | len = ntohs(ip->ip_len); |
| 1293 | tlen = len - toff; |
| 1294 | iptos = ip->ip_tos; |
| 1295 | break; |
| 1296 | #endif |
| 1297 | #ifdef INET6 |
| 1298 | case 6: |
| 1299 | ip = NULL; |
| 1300 | iphlen = sizeof(struct ip6_hdr); |
| 1301 | af = AF_INET6; |
| 1302 | ip6 = mtod(m, struct ip6_hdr *); |
| 1303 | IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, |
| 1304 | sizeof(struct tcphdr)); |
| 1305 | if (th == NULL) { |
| 1306 | TCP_STATINC(TCP_STAT_RCVSHORT); |
| 1307 | return; |
| 1308 | } |
| 1309 | |
| 1310 | /* Be proactive about malicious use of IPv4 mapped address */ |
| 1311 | if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || |
| 1312 | IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { |
| 1313 | /* XXX stat */ |
| 1314 | goto drop; |
| 1315 | } |
| 1316 | |
| 1317 | /* |
| 1318 | * Be proactive about unspecified IPv6 address in source. |
| 1319 | * As we use all-zero to indicate unbounded/unconnected pcb, |
| 1320 | * unspecified IPv6 address can be used to confuse us. |
| 1321 | * |
| 1322 | * Note that packets with unspecified IPv6 destination is |
| 1323 | * already dropped in ip6_input. |
| 1324 | */ |
| 1325 | if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { |
| 1326 | /* XXX stat */ |
| 1327 | goto drop; |
| 1328 | } |
| 1329 | |
| 1330 | /* |
| 1331 | * Make sure destination address is not multicast. |
| 1332 | * Source address checked in ip6_input(). |
| 1333 | */ |
| 1334 | if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { |
| 1335 | /* XXX stat */ |
| 1336 | goto drop; |
| 1337 | } |
| 1338 | |
| 1339 | /* We do the checksum after PCB lookup... */ |
| 1340 | len = m->m_pkthdr.len; |
| 1341 | tlen = len - toff; |
| 1342 | iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; |
| 1343 | break; |
| 1344 | #endif |
| 1345 | default: |
| 1346 | m_freem(m); |
| 1347 | return; |
| 1348 | } |
| 1349 | /* |
| 1350 | * Enforce alignment requirements that are violated in |
| 1351 | * some cases, see kern/50766 for details. |
| 1352 | */ |
| 1353 | if (TCP_HDR_ALIGNED_P(th) == 0) { |
| 1354 | m = m_copyup(m, toff + sizeof(struct tcphdr), 0); |
| 1355 | if (m == NULL) { |
| 1356 | TCP_STATINC(TCP_STAT_RCVSHORT); |
| 1357 | return; |
| 1358 | } |
| 1359 | ip = mtod(m, struct ip *); |
| 1360 | #ifdef INET6 |
| 1361 | ip6 = mtod(m, struct ip6_hdr *); |
| 1362 | #endif |
| 1363 | th = (struct tcphdr *)(mtod(m, char *) + toff); |
| 1364 | } |
| 1365 | KASSERT(TCP_HDR_ALIGNED_P(th)); |
| 1366 | |
| 1367 | /* |
| 1368 | * Check that TCP offset makes sense, |
| 1369 | * pull out TCP options and adjust length. XXX |
| 1370 | */ |
| 1371 | off = th->th_off << 2; |
| 1372 | if (off < sizeof (struct tcphdr) || off > tlen) { |
| 1373 | TCP_STATINC(TCP_STAT_RCVBADOFF); |
| 1374 | goto drop; |
| 1375 | } |
| 1376 | tlen -= off; |
| 1377 | |
| 1378 | /* |
| 1379 | * tcp_input() has been modified to use tlen to mean the TCP data |
| 1380 | * length throughout the function. Other functions can use |
| 1381 | * m->m_pkthdr.len as the basis for calculating the TCP data length. |
| 1382 | * rja |
| 1383 | */ |
| 1384 | |
| 1385 | if (off > sizeof (struct tcphdr)) { |
| 1386 | IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off); |
| 1387 | if (th == NULL) { |
| 1388 | TCP_STATINC(TCP_STAT_RCVSHORT); |
| 1389 | return; |
| 1390 | } |
| 1391 | /* |
| 1392 | * NOTE: ip/ip6 will not be affected by m_pulldown() |
| 1393 | * (as they're before toff) and we don't need to update those. |
| 1394 | */ |
| 1395 | KASSERT(TCP_HDR_ALIGNED_P(th)); |
| 1396 | optlen = off - sizeof (struct tcphdr); |
| 1397 | optp = ((u_int8_t *)th) + sizeof(struct tcphdr); |
| 1398 | /* |
| 1399 | * Do quick retrieval of timestamp options ("options |
| 1400 | * prediction?"). If timestamp is the only option and it's |
| 1401 | * formatted as recommended in RFC 1323 appendix A, we |
| 1402 | * quickly get the values now and not bother calling |
| 1403 | * tcp_dooptions(), etc. |
| 1404 | */ |
| 1405 | if ((optlen == TCPOLEN_TSTAMP_APPA || |
| 1406 | (optlen > TCPOLEN_TSTAMP_APPA && |
| 1407 | optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && |
| 1408 | *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && |
| 1409 | (th->th_flags & TH_SYN) == 0) { |
| 1410 | opti.ts_present = 1; |
| 1411 | opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); |
| 1412 | opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); |
| 1413 | optp = NULL; /* we've parsed the options */ |
| 1414 | } |
| 1415 | } |
| 1416 | tiflags = th->th_flags; |
| 1417 | |
| 1418 | /* |
| 1419 | * Checksum extended TCP header and data |
| 1420 | */ |
| 1421 | if (tcp_input_checksum(af, m, th, toff, off, tlen)) |
| 1422 | goto badcsum; |
| 1423 | |
| 1424 | /* |
| 1425 | * Locate pcb for segment. |
| 1426 | */ |
| 1427 | findpcb: |
| 1428 | inp = NULL; |
| 1429 | #ifdef INET6 |
| 1430 | in6p = NULL; |
| 1431 | #endif |
| 1432 | switch (af) { |
| 1433 | #ifdef INET |
| 1434 | case AF_INET: |
| 1435 | inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport, |
| 1436 | ip->ip_dst, th->th_dport, |
| 1437 | &vestige); |
| 1438 | if (inp == 0 && !vestige.valid) { |
| 1439 | TCP_STATINC(TCP_STAT_PCBHASHMISS); |
| 1440 | inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport); |
| 1441 | } |
| 1442 | #ifdef INET6 |
| 1443 | if (inp == 0 && !vestige.valid) { |
| 1444 | struct in6_addr s, d; |
| 1445 | |
| 1446 | /* mapped addr case */ |
| 1447 | in6_in_2_v4mapin6(&ip->ip_src, &s); |
| 1448 | in6_in_2_v4mapin6(&ip->ip_dst, &d); |
| 1449 | in6p = in6_pcblookup_connect(&tcbtable, &s, |
| 1450 | th->th_sport, &d, th->th_dport, |
| 1451 | 0, &vestige); |
| 1452 | if (in6p == 0 && !vestige.valid) { |
| 1453 | TCP_STATINC(TCP_STAT_PCBHASHMISS); |
| 1454 | in6p = in6_pcblookup_bind(&tcbtable, &d, |
| 1455 | th->th_dport, 0); |
| 1456 | } |
| 1457 | } |
| 1458 | #endif |
| 1459 | #ifndef INET6 |
| 1460 | if (inp == 0 && !vestige.valid) |
| 1461 | #else |
| 1462 | if (inp == 0 && in6p == 0 && !vestige.valid) |
| 1463 | #endif |
| 1464 | { |
| 1465 | TCP_STATINC(TCP_STAT_NOPORT); |
| 1466 | if (tcp_log_refused && |
| 1467 | (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) { |
| 1468 | tcp4_log_refused(ip, th); |
| 1469 | } |
| 1470 | tcp_fields_to_host(th); |
| 1471 | goto dropwithreset_ratelim; |
| 1472 | } |
| 1473 | #if defined(IPSEC) |
| 1474 | if (ipsec_used) { |
| 1475 | if (inp && |
| 1476 | (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0 |
| 1477 | && ipsec4_in_reject(m, inp)) { |
| 1478 | IPSEC_STATINC(IPSEC_STAT_IN_POLVIO); |
| 1479 | goto drop; |
| 1480 | } |
| 1481 | #ifdef INET6 |
| 1482 | else if (in6p && |
| 1483 | (in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 |
| 1484 | && ipsec6_in_reject_so(m, in6p->in6p_socket)) { |
| 1485 | IPSEC_STATINC(IPSEC_STAT_IN_POLVIO); |
| 1486 | goto drop; |
| 1487 | } |
| 1488 | #endif |
| 1489 | } |
| 1490 | #endif /*IPSEC*/ |
| 1491 | break; |
| 1492 | #endif /*INET*/ |
| 1493 | #ifdef INET6 |
| 1494 | case AF_INET6: |
| 1495 | { |
| 1496 | int faith; |
| 1497 | |
| 1498 | #if defined(NFAITH) && NFAITH > 0 |
| 1499 | faith = faithprefix(&ip6->ip6_dst); |
| 1500 | #else |
| 1501 | faith = 0; |
| 1502 | #endif |
| 1503 | in6p = in6_pcblookup_connect(&tcbtable, &ip6->ip6_src, |
| 1504 | th->th_sport, &ip6->ip6_dst, th->th_dport, faith, &vestige); |
| 1505 | if (!in6p && !vestige.valid) { |
| 1506 | TCP_STATINC(TCP_STAT_PCBHASHMISS); |
| 1507 | in6p = in6_pcblookup_bind(&tcbtable, &ip6->ip6_dst, |
| 1508 | th->th_dport, faith); |
| 1509 | } |
| 1510 | if (!in6p && !vestige.valid) { |
| 1511 | TCP_STATINC(TCP_STAT_NOPORT); |
| 1512 | if (tcp_log_refused && |
| 1513 | (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) { |
| 1514 | tcp6_log_refused(ip6, th); |
| 1515 | } |
| 1516 | tcp_fields_to_host(th); |
| 1517 | goto dropwithreset_ratelim; |
| 1518 | } |
| 1519 | #if defined(IPSEC) |
| 1520 | if (ipsec_used && in6p |
| 1521 | && (in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 |
| 1522 | && ipsec6_in_reject(m, in6p)) { |
| 1523 | IPSEC6_STATINC(IPSEC_STAT_IN_POLVIO); |
| 1524 | goto drop; |
| 1525 | } |
| 1526 | #endif /*IPSEC*/ |
| 1527 | break; |
| 1528 | } |
| 1529 | #endif |
| 1530 | } |
| 1531 | |
| 1532 | /* |
| 1533 | * If the state is CLOSED (i.e., TCB does not exist) then |
| 1534 | * all data in the incoming segment is discarded. |
| 1535 | * If the TCB exists but is in CLOSED state, it is embryonic, |
| 1536 | * but should either do a listen or a connect soon. |
| 1537 | */ |
| 1538 | tp = NULL; |
| 1539 | so = NULL; |
| 1540 | if (inp) { |
| 1541 | /* Check the minimum TTL for socket. */ |
| 1542 | if (ip->ip_ttl < inp->inp_ip_minttl) |
| 1543 | goto drop; |
| 1544 | |
| 1545 | tp = intotcpcb(inp); |
| 1546 | so = inp->inp_socket; |
| 1547 | } |
| 1548 | #ifdef INET6 |
| 1549 | else if (in6p) { |
| 1550 | tp = in6totcpcb(in6p); |
| 1551 | so = in6p->in6p_socket; |
| 1552 | } |
| 1553 | #endif |
| 1554 | else if (vestige.valid) { |
| 1555 | int mc = 0; |
| 1556 | |
| 1557 | /* We do not support the resurrection of vtw tcpcps. |
| 1558 | */ |
| 1559 | if (tcp_input_checksum(af, m, th, toff, off, tlen)) |
| 1560 | goto badcsum; |
| 1561 | |
| 1562 | switch (af) { |
| 1563 | #ifdef INET6 |
| 1564 | case AF_INET6: |
| 1565 | mc = IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst); |
| 1566 | break; |
| 1567 | #endif |
| 1568 | |
| 1569 | case AF_INET: |
| 1570 | mc = (IN_MULTICAST(ip->ip_dst.s_addr) |
| 1571 | || in_broadcast(ip->ip_dst, |
| 1572 | m_get_rcvif_NOMPSAFE(m))); |
| 1573 | break; |
| 1574 | } |
| 1575 | |
| 1576 | tcp_fields_to_host(th); |
| 1577 | tcp_vtw_input(th, &vestige, m, tlen, mc); |
| 1578 | m = 0; |
| 1579 | goto drop; |
| 1580 | } |
| 1581 | |
| 1582 | if (tp == 0) { |
| 1583 | tcp_fields_to_host(th); |
| 1584 | goto dropwithreset_ratelim; |
| 1585 | } |
| 1586 | if (tp->t_state == TCPS_CLOSED) |
| 1587 | goto drop; |
| 1588 | |
| 1589 | KASSERT(so->so_lock == softnet_lock); |
| 1590 | KASSERT(solocked(so)); |
| 1591 | |
| 1592 | tcp_fields_to_host(th); |
| 1593 | |
| 1594 | /* Unscale the window into a 32-bit value. */ |
| 1595 | if ((tiflags & TH_SYN) == 0) |
| 1596 | tiwin = th->th_win << tp->snd_scale; |
| 1597 | else |
| 1598 | tiwin = th->th_win; |
| 1599 | |
| 1600 | #ifdef INET6 |
| 1601 | /* save packet options if user wanted */ |
| 1602 | if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) { |
| 1603 | if (in6p->in6p_options) { |
| 1604 | m_freem(in6p->in6p_options); |
| 1605 | in6p->in6p_options = 0; |
| 1606 | } |
| 1607 | KASSERT(ip6 != NULL); |
| 1608 | ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m); |
| 1609 | } |
| 1610 | #endif |
| 1611 | |
| 1612 | if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { |
| 1613 | union syn_cache_sa src; |
| 1614 | union syn_cache_sa dst; |
| 1615 | |
| 1616 | memset(&src, 0, sizeof(src)); |
| 1617 | memset(&dst, 0, sizeof(dst)); |
| 1618 | switch (af) { |
| 1619 | #ifdef INET |
| 1620 | case AF_INET: |
| 1621 | src.sin.sin_len = sizeof(struct sockaddr_in); |
| 1622 | src.sin.sin_family = AF_INET; |
| 1623 | src.sin.sin_addr = ip->ip_src; |
| 1624 | src.sin.sin_port = th->th_sport; |
| 1625 | |
| 1626 | dst.sin.sin_len = sizeof(struct sockaddr_in); |
| 1627 | dst.sin.sin_family = AF_INET; |
| 1628 | dst.sin.sin_addr = ip->ip_dst; |
| 1629 | dst.sin.sin_port = th->th_dport; |
| 1630 | break; |
| 1631 | #endif |
| 1632 | #ifdef INET6 |
| 1633 | case AF_INET6: |
| 1634 | src.sin6.sin6_len = sizeof(struct sockaddr_in6); |
| 1635 | src.sin6.sin6_family = AF_INET6; |
| 1636 | src.sin6.sin6_addr = ip6->ip6_src; |
| 1637 | src.sin6.sin6_port = th->th_sport; |
| 1638 | |
| 1639 | dst.sin6.sin6_len = sizeof(struct sockaddr_in6); |
| 1640 | dst.sin6.sin6_family = AF_INET6; |
| 1641 | dst.sin6.sin6_addr = ip6->ip6_dst; |
| 1642 | dst.sin6.sin6_port = th->th_dport; |
| 1643 | break; |
| 1644 | #endif /* INET6 */ |
| 1645 | default: |
| 1646 | goto badsyn; /*sanity*/ |
| 1647 | } |
| 1648 | |
| 1649 | if (so->so_options & SO_DEBUG) { |
| 1650 | #ifdef TCP_DEBUG |
| 1651 | ostate = tp->t_state; |
| 1652 | #endif |
| 1653 | |
| 1654 | tcp_saveti = NULL; |
| 1655 | if (iphlen + sizeof(struct tcphdr) > MHLEN) |
| 1656 | goto nosave; |
| 1657 | |
| 1658 | if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) { |
| 1659 | tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT); |
| 1660 | if (!tcp_saveti) |
| 1661 | goto nosave; |
| 1662 | } else { |
| 1663 | MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER); |
| 1664 | if (!tcp_saveti) |
| 1665 | goto nosave; |
| 1666 | MCLAIM(m, &tcp_mowner); |
| 1667 | tcp_saveti->m_len = iphlen; |
| 1668 | m_copydata(m, 0, iphlen, |
| 1669 | mtod(tcp_saveti, void *)); |
| 1670 | } |
| 1671 | |
| 1672 | if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) { |
| 1673 | m_freem(tcp_saveti); |
| 1674 | tcp_saveti = NULL; |
| 1675 | } else { |
| 1676 | tcp_saveti->m_len += sizeof(struct tcphdr); |
| 1677 | memcpy(mtod(tcp_saveti, char *) + iphlen, th, |
| 1678 | sizeof(struct tcphdr)); |
| 1679 | } |
| 1680 | nosave:; |
| 1681 | } |
| 1682 | if (so->so_options & SO_ACCEPTCONN) { |
| 1683 | if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { |
| 1684 | if (tiflags & TH_RST) { |
| 1685 | syn_cache_reset(&src.sa, &dst.sa, th); |
| 1686 | } else if ((tiflags & (TH_ACK|TH_SYN)) == |
| 1687 | (TH_ACK|TH_SYN)) { |
| 1688 | /* |
| 1689 | * Received a SYN,ACK. This should |
| 1690 | * never happen while we are in |
| 1691 | * LISTEN. Send an RST. |
| 1692 | */ |
| 1693 | goto badsyn; |
| 1694 | } else if (tiflags & TH_ACK) { |
| 1695 | so = syn_cache_get(&src.sa, &dst.sa, |
| 1696 | th, toff, tlen, so, m); |
| 1697 | if (so == NULL) { |
| 1698 | /* |
| 1699 | * We don't have a SYN for |
| 1700 | * this ACK; send an RST. |
| 1701 | */ |
| 1702 | goto badsyn; |
| 1703 | } else if (so == |
| 1704 | (struct socket *)(-1)) { |
| 1705 | /* |
| 1706 | * We were unable to create |
| 1707 | * the connection. If the |
| 1708 | * 3-way handshake was |
| 1709 | * completed, and RST has |
| 1710 | * been sent to the peer. |
| 1711 | * Since the mbuf might be |
| 1712 | * in use for the reply, |
| 1713 | * do not free it. |
| 1714 | */ |
| 1715 | m = NULL; |
| 1716 | } else { |
| 1717 | /* |
| 1718 | * We have created a |
| 1719 | * full-blown connection. |
| 1720 | */ |
| 1721 | tp = NULL; |
| 1722 | inp = NULL; |
| 1723 | #ifdef INET6 |
| 1724 | in6p = NULL; |
| 1725 | #endif |
| 1726 | switch (so->so_proto->pr_domain->dom_family) { |
| 1727 | #ifdef INET |
| 1728 | case AF_INET: |
| 1729 | inp = sotoinpcb(so); |
| 1730 | tp = intotcpcb(inp); |
| 1731 | break; |
| 1732 | #endif |
| 1733 | #ifdef INET6 |
| 1734 | case AF_INET6: |
| 1735 | in6p = sotoin6pcb(so); |
| 1736 | tp = in6totcpcb(in6p); |
| 1737 | break; |
| 1738 | #endif |
| 1739 | } |
| 1740 | if (tp == NULL) |
| 1741 | goto badsyn; /*XXX*/ |
| 1742 | tiwin <<= tp->snd_scale; |
| 1743 | goto after_listen; |
| 1744 | } |
| 1745 | } else { |
| 1746 | /* |
| 1747 | * None of RST, SYN or ACK was set. |
| 1748 | * This is an invalid packet for a |
| 1749 | * TCB in LISTEN state. Send a RST. |
| 1750 | */ |
| 1751 | goto badsyn; |
| 1752 | } |
| 1753 | } else { |
| 1754 | /* |
| 1755 | * Received a SYN. |
| 1756 | * |
| 1757 | * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN |
| 1758 | */ |
| 1759 | if (m->m_flags & (M_BCAST|M_MCAST)) |
| 1760 | goto drop; |
| 1761 | |
| 1762 | switch (af) { |
| 1763 | #ifdef INET6 |
| 1764 | case AF_INET6: |
| 1765 | if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) |
| 1766 | goto drop; |
| 1767 | break; |
| 1768 | #endif /* INET6 */ |
| 1769 | case AF_INET: |
| 1770 | if (IN_MULTICAST(ip->ip_dst.s_addr) || |
| 1771 | in_broadcast(ip->ip_dst, |
| 1772 | m_get_rcvif_NOMPSAFE(m))) |
| 1773 | goto drop; |
| 1774 | break; |
| 1775 | } |
| 1776 | |
| 1777 | #ifdef INET6 |
| 1778 | /* |
| 1779 | * If deprecated address is forbidden, we do |
| 1780 | * not accept SYN to deprecated interface |
| 1781 | * address to prevent any new inbound |
| 1782 | * connection from getting established. |
| 1783 | * When we do not accept SYN, we send a TCP |
| 1784 | * RST, with deprecated source address (instead |
| 1785 | * of dropping it). We compromise it as it is |
| 1786 | * much better for peer to send a RST, and |
| 1787 | * RST will be the final packet for the |
| 1788 | * exchange. |
| 1789 | * |
| 1790 | * If we do not forbid deprecated addresses, we |
| 1791 | * accept the SYN packet. RFC2462 does not |
| 1792 | * suggest dropping SYN in this case. |
| 1793 | * If we decipher RFC2462 5.5.4, it says like |
| 1794 | * this: |
| 1795 | * 1. use of deprecated addr with existing |
| 1796 | * communication is okay - "SHOULD continue |
| 1797 | * to be used" |
| 1798 | * 2. use of it with new communication: |
| 1799 | * (2a) "SHOULD NOT be used if alternate |
| 1800 | * address with sufficient scope is |
| 1801 | * available" |
| 1802 | * (2b) nothing mentioned otherwise. |
| 1803 | * Here we fall into (2b) case as we have no |
| 1804 | * choice in our source address selection - we |
| 1805 | * must obey the peer. |
| 1806 | * |
| 1807 | * The wording in RFC2462 is confusing, and |
| 1808 | * there are multiple description text for |
| 1809 | * deprecated address handling - worse, they |
| 1810 | * are not exactly the same. I believe 5.5.4 |
| 1811 | * is the best one, so we follow 5.5.4. |
| 1812 | */ |
| 1813 | if (af == AF_INET6 && !ip6_use_deprecated) { |
| 1814 | struct in6_ifaddr *ia6; |
| 1815 | int s; |
| 1816 | struct ifnet *rcvif = m_get_rcvif(m, &s); |
| 1817 | if (rcvif == NULL) |
| 1818 | goto dropwithreset; /* XXX */ |
| 1819 | if ((ia6 = in6ifa_ifpwithaddr(rcvif, |
| 1820 | &ip6->ip6_dst)) && |
| 1821 | (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { |
| 1822 | tp = NULL; |
| 1823 | m_put_rcvif(rcvif, &s); |
| 1824 | goto dropwithreset; |
| 1825 | } |
| 1826 | m_put_rcvif(rcvif, &s); |
| 1827 | } |
| 1828 | #endif |
| 1829 | |
| 1830 | #if defined(IPSEC) |
| 1831 | if (ipsec_used) { |
| 1832 | switch (af) { |
| 1833 | #ifdef INET |
| 1834 | case AF_INET: |
| 1835 | if (!ipsec4_in_reject_so(m, so)) |
| 1836 | break; |
| 1837 | IPSEC_STATINC( |
| 1838 | IPSEC_STAT_IN_POLVIO); |
| 1839 | tp = NULL; |
| 1840 | goto dropwithreset; |
| 1841 | #endif |
| 1842 | #ifdef INET6 |
| 1843 | case AF_INET6: |
| 1844 | if (!ipsec6_in_reject_so(m, so)) |
| 1845 | break; |
| 1846 | IPSEC6_STATINC( |
| 1847 | IPSEC_STAT_IN_POLVIO); |
| 1848 | tp = NULL; |
| 1849 | goto dropwithreset; |
| 1850 | #endif /*INET6*/ |
| 1851 | } |
| 1852 | } |
| 1853 | #endif /*IPSEC*/ |
| 1854 | |
| 1855 | /* |
| 1856 | * LISTEN socket received a SYN |
| 1857 | * from itself? This can't possibly |
| 1858 | * be valid; drop the packet. |
| 1859 | */ |
| 1860 | if (th->th_sport == th->th_dport) { |
| 1861 | int i; |
| 1862 | |
| 1863 | switch (af) { |
| 1864 | #ifdef INET |
| 1865 | case AF_INET: |
| 1866 | i = in_hosteq(ip->ip_src, ip->ip_dst); |
| 1867 | break; |
| 1868 | #endif |
| 1869 | #ifdef INET6 |
| 1870 | case AF_INET6: |
| 1871 | i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst); |
| 1872 | break; |
| 1873 | #endif |
| 1874 | default: |
| 1875 | i = 1; |
| 1876 | } |
| 1877 | if (i) { |
| 1878 | TCP_STATINC(TCP_STAT_BADSYN); |
| 1879 | goto drop; |
| 1880 | } |
| 1881 | } |
| 1882 | |
| 1883 | /* |
| 1884 | * SYN looks ok; create compressed TCP |
| 1885 | * state for it. |
| 1886 | */ |
| 1887 | if (so->so_qlen <= so->so_qlimit && |
| 1888 | syn_cache_add(&src.sa, &dst.sa, th, tlen, |
| 1889 | so, m, optp, optlen, &opti)) |
| 1890 | m = NULL; |
| 1891 | } |
| 1892 | goto drop; |
| 1893 | } |
| 1894 | } |
| 1895 | |
| 1896 | after_listen: |
| 1897 | #ifdef DIAGNOSTIC |
| 1898 | /* |
| 1899 | * Should not happen now that all embryonic connections |
| 1900 | * are handled with compressed state. |
| 1901 | */ |
| 1902 | if (tp->t_state == TCPS_LISTEN) |
| 1903 | panic("tcp_input: TCPS_LISTEN" ); |
| 1904 | #endif |
| 1905 | |
| 1906 | /* |
| 1907 | * Segment received on connection. |
| 1908 | * Reset idle time and keep-alive timer. |
| 1909 | */ |
| 1910 | tp->t_rcvtime = tcp_now; |
| 1911 | if (TCPS_HAVEESTABLISHED(tp->t_state)) |
| 1912 | TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle); |
| 1913 | |
| 1914 | /* |
| 1915 | * Process options. |
| 1916 | */ |
| 1917 | #ifdef TCP_SIGNATURE |
| 1918 | if (optp || (tp->t_flags & TF_SIGNATURE)) |
| 1919 | #else |
| 1920 | if (optp) |
| 1921 | #endif |
| 1922 | if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0) |
| 1923 | goto drop; |
| 1924 | |
| 1925 | if (TCP_SACK_ENABLED(tp)) { |
| 1926 | tcp_del_sackholes(tp, th); |
| 1927 | } |
| 1928 | |
| 1929 | if (TCP_ECN_ALLOWED(tp)) { |
| 1930 | if (tiflags & TH_CWR) { |
| 1931 | tp->t_flags &= ~TF_ECN_SND_ECE; |
| 1932 | } |
| 1933 | switch (iptos & IPTOS_ECN_MASK) { |
| 1934 | case IPTOS_ECN_CE: |
| 1935 | tp->t_flags |= TF_ECN_SND_ECE; |
| 1936 | TCP_STATINC(TCP_STAT_ECN_CE); |
| 1937 | break; |
| 1938 | case IPTOS_ECN_ECT0: |
| 1939 | TCP_STATINC(TCP_STAT_ECN_ECT); |
| 1940 | break; |
| 1941 | case IPTOS_ECN_ECT1: |
| 1942 | /* XXX: ignore for now -- rpaulo */ |
| 1943 | break; |
| 1944 | } |
| 1945 | /* |
| 1946 | * Congestion experienced. |
| 1947 | * Ignore if we are already trying to recover. |
| 1948 | */ |
| 1949 | if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover)) |
| 1950 | tp->t_congctl->cong_exp(tp); |
| 1951 | } |
| 1952 | |
| 1953 | if (opti.ts_present && opti.ts_ecr) { |
| 1954 | /* |
| 1955 | * Calculate the RTT from the returned time stamp and the |
| 1956 | * connection's time base. If the time stamp is later than |
| 1957 | * the current time, or is extremely old, fall back to non-1323 |
| 1958 | * RTT calculation. Since ts_rtt is unsigned, we can test both |
| 1959 | * at the same time. |
| 1960 | * |
| 1961 | * Note that ts_rtt is in units of slow ticks (500 |
| 1962 | * ms). Since most earthbound RTTs are < 500 ms, |
| 1963 | * observed values will have large quantization noise. |
| 1964 | * Our smoothed RTT is then the fraction of observed |
| 1965 | * samples that are 1 tick instead of 0 (times 500 |
| 1966 | * ms). |
| 1967 | * |
| 1968 | * ts_rtt is increased by 1 to denote a valid sample, |
| 1969 | * with 0 indicating an invalid measurement. This |
| 1970 | * extra 1 must be removed when ts_rtt is used, or |
| 1971 | * else an erroneous extra 500 ms will result. |
| 1972 | */ |
| 1973 | ts_rtt = TCP_TIMESTAMP(tp) - opti.ts_ecr + 1; |
| 1974 | if (ts_rtt > TCP_PAWS_IDLE) |
| 1975 | ts_rtt = 0; |
| 1976 | } else { |
| 1977 | ts_rtt = 0; |
| 1978 | } |
| 1979 | |
| 1980 | /* |
| 1981 | * Header prediction: check for the two common cases |
| 1982 | * of a uni-directional data xfer. If the packet has |
| 1983 | * no control flags, is in-sequence, the window didn't |
| 1984 | * change and we're not retransmitting, it's a |
| 1985 | * candidate. If the length is zero and the ack moved |
| 1986 | * forward, we're the sender side of the xfer. Just |
| 1987 | * free the data acked & wake any higher level process |
| 1988 | * that was blocked waiting for space. If the length |
| 1989 | * is non-zero and the ack didn't move, we're the |
| 1990 | * receiver side. If we're getting packets in-order |
| 1991 | * (the reassembly queue is empty), add the data to |
| 1992 | * the socket buffer and note that we need a delayed ack. |
| 1993 | */ |
| 1994 | if (tp->t_state == TCPS_ESTABLISHED && |
| 1995 | (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) |
| 1996 | == TH_ACK && |
| 1997 | (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && |
| 1998 | th->th_seq == tp->rcv_nxt && |
| 1999 | tiwin && tiwin == tp->snd_wnd && |
| 2000 | tp->snd_nxt == tp->snd_max) { |
| 2001 | |
| 2002 | /* |
| 2003 | * If last ACK falls within this segment's sequence numbers, |
| 2004 | * record the timestamp. |
| 2005 | * NOTE that the test is modified according to the latest |
| 2006 | * proposal of the tcplw@cray.com list (Braden 1993/04/26). |
| 2007 | * |
| 2008 | * note that we already know |
| 2009 | * TSTMP_GEQ(opti.ts_val, tp->ts_recent) |
| 2010 | */ |
| 2011 | if (opti.ts_present && |
| 2012 | SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { |
| 2013 | tp->ts_recent_age = tcp_now; |
| 2014 | tp->ts_recent = opti.ts_val; |
| 2015 | } |
| 2016 | |
| 2017 | if (tlen == 0) { |
| 2018 | /* Ack prediction. */ |
| 2019 | if (SEQ_GT(th->th_ack, tp->snd_una) && |
| 2020 | SEQ_LEQ(th->th_ack, tp->snd_max) && |
| 2021 | tp->snd_cwnd >= tp->snd_wnd && |
| 2022 | tp->t_partialacks < 0) { |
| 2023 | /* |
| 2024 | * this is a pure ack for outstanding data. |
| 2025 | */ |
| 2026 | if (ts_rtt) |
| 2027 | tcp_xmit_timer(tp, ts_rtt - 1); |
| 2028 | else if (tp->t_rtttime && |
| 2029 | SEQ_GT(th->th_ack, tp->t_rtseq)) |
| 2030 | tcp_xmit_timer(tp, |
| 2031 | tcp_now - tp->t_rtttime); |
| 2032 | acked = th->th_ack - tp->snd_una; |
| 2033 | tcps = TCP_STAT_GETREF(); |
| 2034 | tcps[TCP_STAT_PREDACK]++; |
| 2035 | tcps[TCP_STAT_RCVACKPACK]++; |
| 2036 | tcps[TCP_STAT_RCVACKBYTE] += acked; |
| 2037 | TCP_STAT_PUTREF(); |
| 2038 | nd6_hint(tp); |
| 2039 | |
| 2040 | if (acked > (tp->t_lastoff - tp->t_inoff)) |
| 2041 | tp->t_lastm = NULL; |
| 2042 | sbdrop(&so->so_snd, acked); |
| 2043 | tp->t_lastoff -= acked; |
| 2044 | |
| 2045 | icmp_check(tp, th, acked); |
| 2046 | |
| 2047 | tp->snd_una = th->th_ack; |
| 2048 | tp->snd_fack = tp->snd_una; |
| 2049 | if (SEQ_LT(tp->snd_high, tp->snd_una)) |
| 2050 | tp->snd_high = tp->snd_una; |
| 2051 | m_freem(m); |
| 2052 | |
| 2053 | /* |
| 2054 | * If all outstanding data are acked, stop |
| 2055 | * retransmit timer, otherwise restart timer |
| 2056 | * using current (possibly backed-off) value. |
| 2057 | * If process is waiting for space, |
| 2058 | * wakeup/selnotify/signal. If data |
| 2059 | * are ready to send, let tcp_output |
| 2060 | * decide between more output or persist. |
| 2061 | */ |
| 2062 | if (tp->snd_una == tp->snd_max) |
| 2063 | TCP_TIMER_DISARM(tp, TCPT_REXMT); |
| 2064 | else if (TCP_TIMER_ISARMED(tp, |
| 2065 | TCPT_PERSIST) == 0) |
| 2066 | TCP_TIMER_ARM(tp, TCPT_REXMT, |
| 2067 | tp->t_rxtcur); |
| 2068 | |
| 2069 | sowwakeup(so); |
| 2070 | if (so->so_snd.sb_cc) { |
| 2071 | KERNEL_LOCK(1, NULL); |
| 2072 | (void) tcp_output(tp); |
| 2073 | KERNEL_UNLOCK_ONE(NULL); |
| 2074 | } |
| 2075 | if (tcp_saveti) |
| 2076 | m_freem(tcp_saveti); |
| 2077 | return; |
| 2078 | } |
| 2079 | } else if (th->th_ack == tp->snd_una && |
| 2080 | TAILQ_FIRST(&tp->segq) == NULL && |
| 2081 | tlen <= sbspace(&so->so_rcv)) { |
| 2082 | int newsize = 0; /* automatic sockbuf scaling */ |
| 2083 | |
| 2084 | /* |
| 2085 | * this is a pure, in-sequence data packet |
| 2086 | * with nothing on the reassembly queue and |
| 2087 | * we have enough buffer space to take it. |
| 2088 | */ |
| 2089 | tp->rcv_nxt += tlen; |
| 2090 | tcps = TCP_STAT_GETREF(); |
| 2091 | tcps[TCP_STAT_PREDDAT]++; |
| 2092 | tcps[TCP_STAT_RCVPACK]++; |
| 2093 | tcps[TCP_STAT_RCVBYTE] += tlen; |
| 2094 | TCP_STAT_PUTREF(); |
| 2095 | nd6_hint(tp); |
| 2096 | |
| 2097 | /* |
| 2098 | * Automatic sizing enables the performance of large buffers |
| 2099 | * and most of the efficiency of small ones by only allocating |
| 2100 | * space when it is needed. |
| 2101 | * |
| 2102 | * On the receive side the socket buffer memory is only rarely |
| 2103 | * used to any significant extent. This allows us to be much |
| 2104 | * more aggressive in scaling the receive socket buffer. For |
| 2105 | * the case that the buffer space is actually used to a large |
| 2106 | * extent and we run out of kernel memory we can simply drop |
| 2107 | * the new segments; TCP on the sender will just retransmit it |
| 2108 | * later. Setting the buffer size too big may only consume too |
| 2109 | * much kernel memory if the application doesn't read() from |
| 2110 | * the socket or packet loss or reordering makes use of the |
| 2111 | * reassembly queue. |
| 2112 | * |
| 2113 | * The criteria to step up the receive buffer one notch are: |
| 2114 | * 1. the number of bytes received during the time it takes |
| 2115 | * one timestamp to be reflected back to us (the RTT); |
| 2116 | * 2. received bytes per RTT is within seven eighth of the |
| 2117 | * current socket buffer size; |
| 2118 | * 3. receive buffer size has not hit maximal automatic size; |
| 2119 | * |
| 2120 | * This algorithm does one step per RTT at most and only if |
| 2121 | * we receive a bulk stream w/o packet losses or reorderings. |
| 2122 | * Shrinking the buffer during idle times is not necessary as |
| 2123 | * it doesn't consume any memory when idle. |
| 2124 | * |
| 2125 | * TODO: Only step up if the application is actually serving |
| 2126 | * the buffer to better manage the socket buffer resources. |
| 2127 | */ |
| 2128 | if (tcp_do_autorcvbuf && |
| 2129 | opti.ts_ecr && |
| 2130 | (so->so_rcv.sb_flags & SB_AUTOSIZE)) { |
| 2131 | if (opti.ts_ecr > tp->rfbuf_ts && |
| 2132 | opti.ts_ecr - tp->rfbuf_ts < PR_SLOWHZ) { |
| 2133 | if (tp->rfbuf_cnt > |
| 2134 | (so->so_rcv.sb_hiwat / 8 * 7) && |
| 2135 | so->so_rcv.sb_hiwat < |
| 2136 | tcp_autorcvbuf_max) { |
| 2137 | newsize = |
| 2138 | min(so->so_rcv.sb_hiwat + |
| 2139 | tcp_autorcvbuf_inc, |
| 2140 | tcp_autorcvbuf_max); |
| 2141 | } |
| 2142 | /* Start over with next RTT. */ |
| 2143 | tp->rfbuf_ts = 0; |
| 2144 | tp->rfbuf_cnt = 0; |
| 2145 | } else |
| 2146 | tp->rfbuf_cnt += tlen; /* add up */ |
| 2147 | } |
| 2148 | |
| 2149 | /* |
| 2150 | * Drop TCP, IP headers and TCP options then add data |
| 2151 | * to socket buffer. |
| 2152 | */ |
| 2153 | if (so->so_state & SS_CANTRCVMORE) |
| 2154 | m_freem(m); |
| 2155 | else { |
| 2156 | /* |
| 2157 | * Set new socket buffer size. |
| 2158 | * Give up when limit is reached. |
| 2159 | */ |
| 2160 | if (newsize) |
| 2161 | if (!sbreserve(&so->so_rcv, |
| 2162 | newsize, so)) |
| 2163 | so->so_rcv.sb_flags &= ~SB_AUTOSIZE; |
| 2164 | m_adj(m, toff + off); |
| 2165 | sbappendstream(&so->so_rcv, m); |
| 2166 | } |
| 2167 | sorwakeup(so); |
| 2168 | tcp_setup_ack(tp, th); |
| 2169 | if (tp->t_flags & TF_ACKNOW) { |
| 2170 | KERNEL_LOCK(1, NULL); |
| 2171 | (void) tcp_output(tp); |
| 2172 | KERNEL_UNLOCK_ONE(NULL); |
| 2173 | } |
| 2174 | if (tcp_saveti) |
| 2175 | m_freem(tcp_saveti); |
| 2176 | return; |
| 2177 | } |
| 2178 | } |
| 2179 | |
| 2180 | /* |
| 2181 | * Compute mbuf offset to TCP data segment. |
| 2182 | */ |
| 2183 | hdroptlen = toff + off; |
| 2184 | |
| 2185 | /* |
| 2186 | * Calculate amount of space in receive window, |
| 2187 | * and then do TCP input processing. |
| 2188 | * Receive window is amount of space in rcv queue, |
| 2189 | * but not less than advertised window. |
| 2190 | */ |
| 2191 | { int win; |
| 2192 | |
| 2193 | win = sbspace(&so->so_rcv); |
| 2194 | if (win < 0) |
| 2195 | win = 0; |
| 2196 | tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); |
| 2197 | } |
| 2198 | |
| 2199 | /* Reset receive buffer auto scaling when not in bulk receive mode. */ |
| 2200 | tp->rfbuf_ts = 0; |
| 2201 | tp->rfbuf_cnt = 0; |
| 2202 | |
| 2203 | switch (tp->t_state) { |
| 2204 | /* |
| 2205 | * If the state is SYN_SENT: |
| 2206 | * if seg contains an ACK, but not for our SYN, drop the input. |
| 2207 | * if seg contains a RST, then drop the connection. |
| 2208 | * if seg does not contain SYN, then drop it. |
| 2209 | * Otherwise this is an acceptable SYN segment |
| 2210 | * initialize tp->rcv_nxt and tp->irs |
| 2211 | * if seg contains ack then advance tp->snd_una |
| 2212 | * if seg contains an ECE and ECN support is enabled, the stream |
| 2213 | * is ECN capable. |
| 2214 | * if SYN has been acked change to ESTABLISHED else SYN_RCVD state |
| 2215 | * arrange for segment to be acked (eventually) |
| 2216 | * continue processing rest of data/controls, beginning with URG |
| 2217 | */ |
| 2218 | case TCPS_SYN_SENT: |
| 2219 | if ((tiflags & TH_ACK) && |
| 2220 | (SEQ_LEQ(th->th_ack, tp->iss) || |
| 2221 | SEQ_GT(th->th_ack, tp->snd_max))) |
| 2222 | goto dropwithreset; |
| 2223 | if (tiflags & TH_RST) { |
| 2224 | if (tiflags & TH_ACK) |
| 2225 | tp = tcp_drop(tp, ECONNREFUSED); |
| 2226 | goto drop; |
| 2227 | } |
| 2228 | if ((tiflags & TH_SYN) == 0) |
| 2229 | goto drop; |
| 2230 | if (tiflags & TH_ACK) { |
| 2231 | tp->snd_una = th->th_ack; |
| 2232 | if (SEQ_LT(tp->snd_nxt, tp->snd_una)) |
| 2233 | tp->snd_nxt = tp->snd_una; |
| 2234 | if (SEQ_LT(tp->snd_high, tp->snd_una)) |
| 2235 | tp->snd_high = tp->snd_una; |
| 2236 | TCP_TIMER_DISARM(tp, TCPT_REXMT); |
| 2237 | |
| 2238 | if ((tiflags & TH_ECE) && tcp_do_ecn) { |
| 2239 | tp->t_flags |= TF_ECN_PERMIT; |
| 2240 | TCP_STATINC(TCP_STAT_ECN_SHS); |
| 2241 | } |
| 2242 | |
| 2243 | } |
| 2244 | tp->irs = th->th_seq; |
| 2245 | tcp_rcvseqinit(tp); |
| 2246 | tp->t_flags |= TF_ACKNOW; |
| 2247 | tcp_mss_from_peer(tp, opti.maxseg); |
| 2248 | |
| 2249 | /* |
| 2250 | * Initialize the initial congestion window. If we |
| 2251 | * had to retransmit the SYN, we must initialize cwnd |
| 2252 | * to 1 segment (i.e. the Loss Window). |
| 2253 | */ |
| 2254 | if (tp->t_flags & TF_SYN_REXMT) |
| 2255 | tp->snd_cwnd = tp->t_peermss; |
| 2256 | else { |
| 2257 | int ss = tcp_init_win; |
| 2258 | #ifdef INET |
| 2259 | if (inp != NULL && in_localaddr(inp->inp_faddr)) |
| 2260 | ss = tcp_init_win_local; |
| 2261 | #endif |
| 2262 | #ifdef INET6 |
| 2263 | if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr)) |
| 2264 | ss = tcp_init_win_local; |
| 2265 | #endif |
| 2266 | tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss); |
| 2267 | } |
| 2268 | |
| 2269 | tcp_rmx_rtt(tp); |
| 2270 | if (tiflags & TH_ACK) { |
| 2271 | TCP_STATINC(TCP_STAT_CONNECTS); |
| 2272 | /* |
| 2273 | * move tcp_established before soisconnected |
| 2274 | * because upcall handler can drive tcp_output |
| 2275 | * functionality. |
| 2276 | * XXX we might call soisconnected at the end of |
| 2277 | * all processing |
| 2278 | */ |
| 2279 | tcp_established(tp); |
| 2280 | soisconnected(so); |
| 2281 | /* Do window scaling on this connection? */ |
| 2282 | if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == |
| 2283 | (TF_RCVD_SCALE|TF_REQ_SCALE)) { |
| 2284 | tp->snd_scale = tp->requested_s_scale; |
| 2285 | tp->rcv_scale = tp->request_r_scale; |
| 2286 | } |
| 2287 | TCP_REASS_LOCK(tp); |
| 2288 | (void) tcp_reass(tp, NULL, NULL, &tlen); |
| 2289 | /* |
| 2290 | * if we didn't have to retransmit the SYN, |
| 2291 | * use its rtt as our initial srtt & rtt var. |
| 2292 | */ |
| 2293 | if (tp->t_rtttime) |
| 2294 | tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); |
| 2295 | } else |
| 2296 | tp->t_state = TCPS_SYN_RECEIVED; |
| 2297 | |
| 2298 | /* |
| 2299 | * Advance th->th_seq to correspond to first data byte. |
| 2300 | * If data, trim to stay within window, |
| 2301 | * dropping FIN if necessary. |
| 2302 | */ |
| 2303 | th->th_seq++; |
| 2304 | if (tlen > tp->rcv_wnd) { |
| 2305 | todrop = tlen - tp->rcv_wnd; |
| 2306 | m_adj(m, -todrop); |
| 2307 | tlen = tp->rcv_wnd; |
| 2308 | tiflags &= ~TH_FIN; |
| 2309 | tcps = TCP_STAT_GETREF(); |
| 2310 | tcps[TCP_STAT_RCVPACKAFTERWIN]++; |
| 2311 | tcps[TCP_STAT_RCVBYTEAFTERWIN] += todrop; |
| 2312 | TCP_STAT_PUTREF(); |
| 2313 | } |
| 2314 | tp->snd_wl1 = th->th_seq - 1; |
| 2315 | tp->rcv_up = th->th_seq; |
| 2316 | goto step6; |
| 2317 | |
| 2318 | /* |
| 2319 | * If the state is SYN_RECEIVED: |
| 2320 | * If seg contains an ACK, but not for our SYN, drop the input |
| 2321 | * and generate an RST. See page 36, rfc793 |
| 2322 | */ |
| 2323 | case TCPS_SYN_RECEIVED: |
| 2324 | if ((tiflags & TH_ACK) && |
| 2325 | (SEQ_LEQ(th->th_ack, tp->iss) || |
| 2326 | SEQ_GT(th->th_ack, tp->snd_max))) |
| 2327 | goto dropwithreset; |
| 2328 | break; |
| 2329 | } |
| 2330 | |
| 2331 | /* |
| 2332 | * States other than LISTEN or SYN_SENT. |
| 2333 | * First check timestamp, if present. |
| 2334 | * Then check that at least some bytes of segment are within |
| 2335 | * receive window. If segment begins before rcv_nxt, |
| 2336 | * drop leading data (and SYN); if nothing left, just ack. |
| 2337 | * |
| 2338 | * RFC 1323 PAWS: If we have a timestamp reply on this segment |
| 2339 | * and it's less than ts_recent, drop it. |
| 2340 | */ |
| 2341 | if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && |
| 2342 | TSTMP_LT(opti.ts_val, tp->ts_recent)) { |
| 2343 | |
| 2344 | /* Check to see if ts_recent is over 24 days old. */ |
| 2345 | if (tcp_now - tp->ts_recent_age > TCP_PAWS_IDLE) { |
| 2346 | /* |
| 2347 | * Invalidate ts_recent. If this segment updates |
| 2348 | * ts_recent, the age will be reset later and ts_recent |
| 2349 | * will get a valid value. If it does not, setting |
| 2350 | * ts_recent to zero will at least satisfy the |
| 2351 | * requirement that zero be placed in the timestamp |
| 2352 | * echo reply when ts_recent isn't valid. The |
| 2353 | * age isn't reset until we get a valid ts_recent |
| 2354 | * because we don't want out-of-order segments to be |
| 2355 | * dropped when ts_recent is old. |
| 2356 | */ |
| 2357 | tp->ts_recent = 0; |
| 2358 | } else { |
| 2359 | tcps = TCP_STAT_GETREF(); |
| 2360 | tcps[TCP_STAT_RCVDUPPACK]++; |
| 2361 | tcps[TCP_STAT_RCVDUPBYTE] += tlen; |
| 2362 | tcps[TCP_STAT_PAWSDROP]++; |
| 2363 | TCP_STAT_PUTREF(); |
| 2364 | tcp_new_dsack(tp, th->th_seq, tlen); |
| 2365 | goto dropafterack; |
| 2366 | } |
| 2367 | } |
| 2368 | |
| 2369 | todrop = tp->rcv_nxt - th->th_seq; |
| 2370 | dupseg = false; |
| 2371 | if (todrop > 0) { |
| 2372 | if (tiflags & TH_SYN) { |
| 2373 | tiflags &= ~TH_SYN; |
| 2374 | th->th_seq++; |
| 2375 | if (th->th_urp > 1) |
| 2376 | th->th_urp--; |
| 2377 | else { |
| 2378 | tiflags &= ~TH_URG; |
| 2379 | th->th_urp = 0; |
| 2380 | } |
| 2381 | todrop--; |
| 2382 | } |
| 2383 | if (todrop > tlen || |
| 2384 | (todrop == tlen && (tiflags & TH_FIN) == 0)) { |
| 2385 | /* |
| 2386 | * Any valid FIN or RST must be to the left of the |
| 2387 | * window. At this point the FIN or RST must be a |
| 2388 | * duplicate or out of sequence; drop it. |
| 2389 | */ |
| 2390 | if (tiflags & TH_RST) |
| 2391 | goto drop; |
| 2392 | tiflags &= ~(TH_FIN|TH_RST); |
| 2393 | /* |
| 2394 | * Send an ACK to resynchronize and drop any data. |
| 2395 | * But keep on processing for RST or ACK. |
| 2396 | */ |
| 2397 | tp->t_flags |= TF_ACKNOW; |
| 2398 | todrop = tlen; |
| 2399 | dupseg = true; |
| 2400 | tcps = TCP_STAT_GETREF(); |
| 2401 | tcps[TCP_STAT_RCVDUPPACK]++; |
| 2402 | tcps[TCP_STAT_RCVDUPBYTE] += todrop; |
| 2403 | TCP_STAT_PUTREF(); |
| 2404 | } else if ((tiflags & TH_RST) && |
| 2405 | th->th_seq != tp->rcv_nxt) { |
| 2406 | /* |
| 2407 | * Test for reset before adjusting the sequence |
| 2408 | * number for overlapping data. |
| 2409 | */ |
| 2410 | goto dropafterack_ratelim; |
| 2411 | } else { |
| 2412 | tcps = TCP_STAT_GETREF(); |
| 2413 | tcps[TCP_STAT_RCVPARTDUPPACK]++; |
| 2414 | tcps[TCP_STAT_RCVPARTDUPBYTE] += todrop; |
| 2415 | TCP_STAT_PUTREF(); |
| 2416 | } |
| 2417 | tcp_new_dsack(tp, th->th_seq, todrop); |
| 2418 | hdroptlen += todrop; /*drop from head afterwards*/ |
| 2419 | th->th_seq += todrop; |
| 2420 | tlen -= todrop; |
| 2421 | if (th->th_urp > todrop) |
| 2422 | th->th_urp -= todrop; |
| 2423 | else { |
| 2424 | tiflags &= ~TH_URG; |
| 2425 | th->th_urp = 0; |
| 2426 | } |
| 2427 | } |
| 2428 | |
| 2429 | /* |
| 2430 | * If new data are received on a connection after the |
| 2431 | * user processes are gone, then RST the other end. |
| 2432 | */ |
| 2433 | if ((so->so_state & SS_NOFDREF) && |
| 2434 | tp->t_state > TCPS_CLOSE_WAIT && tlen) { |
| 2435 | tp = tcp_close(tp); |
| 2436 | TCP_STATINC(TCP_STAT_RCVAFTERCLOSE); |
| 2437 | goto dropwithreset; |
| 2438 | } |
| 2439 | |
| 2440 | /* |
| 2441 | * If segment ends after window, drop trailing data |
| 2442 | * (and PUSH and FIN); if nothing left, just ACK. |
| 2443 | */ |
| 2444 | todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); |
| 2445 | if (todrop > 0) { |
| 2446 | TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN); |
| 2447 | if (todrop >= tlen) { |
| 2448 | /* |
| 2449 | * The segment actually starts after the window. |
| 2450 | * th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen |
| 2451 | * th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0 |
| 2452 | * th->th_seq >= tp->rcv_nxt + tp->rcv_wnd |
| 2453 | */ |
| 2454 | TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen); |
| 2455 | /* |
| 2456 | * If a new connection request is received |
| 2457 | * while in TIME_WAIT, drop the old connection |
| 2458 | * and start over if the sequence numbers |
| 2459 | * are above the previous ones. |
| 2460 | * |
| 2461 | * NOTE: We will checksum the packet again, and |
| 2462 | * so we need to put the header fields back into |
| 2463 | * network order! |
| 2464 | * XXX This kind of sucks, but we don't expect |
| 2465 | * XXX this to happen very often, so maybe it |
| 2466 | * XXX doesn't matter so much. |
| 2467 | */ |
| 2468 | if (tiflags & TH_SYN && |
| 2469 | tp->t_state == TCPS_TIME_WAIT && |
| 2470 | SEQ_GT(th->th_seq, tp->rcv_nxt)) { |
| 2471 | tp = tcp_close(tp); |
| 2472 | tcp_fields_to_net(th); |
| 2473 | goto findpcb; |
| 2474 | } |
| 2475 | /* |
| 2476 | * If window is closed can only take segments at |
| 2477 | * window edge, and have to drop data and PUSH from |
| 2478 | * incoming segments. Continue processing, but |
| 2479 | * remember to ack. Otherwise, drop segment |
| 2480 | * and (if not RST) ack. |
| 2481 | */ |
| 2482 | if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { |
| 2483 | tp->t_flags |= TF_ACKNOW; |
| 2484 | TCP_STATINC(TCP_STAT_RCVWINPROBE); |
| 2485 | } else |
| 2486 | goto dropafterack; |
| 2487 | } else |
| 2488 | TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop); |
| 2489 | m_adj(m, -todrop); |
| 2490 | tlen -= todrop; |
| 2491 | tiflags &= ~(TH_PUSH|TH_FIN); |
| 2492 | } |
| 2493 | |
| 2494 | /* |
| 2495 | * If last ACK falls within this segment's sequence numbers, |
| 2496 | * record the timestamp. |
| 2497 | * NOTE: |
| 2498 | * 1) That the test incorporates suggestions from the latest |
| 2499 | * proposal of the tcplw@cray.com list (Braden 1993/04/26). |
| 2500 | * 2) That updating only on newer timestamps interferes with |
| 2501 | * our earlier PAWS tests, so this check should be solely |
| 2502 | * predicated on the sequence space of this segment. |
| 2503 | * 3) That we modify the segment boundary check to be |
| 2504 | * Last.ACK.Sent <= SEG.SEQ + SEG.Len |
| 2505 | * instead of RFC1323's |
| 2506 | * Last.ACK.Sent < SEG.SEQ + SEG.Len, |
| 2507 | * This modified check allows us to overcome RFC1323's |
| 2508 | * limitations as described in Stevens TCP/IP Illustrated |
| 2509 | * Vol. 2 p.869. In such cases, we can still calculate the |
| 2510 | * RTT correctly when RCV.NXT == Last.ACK.Sent. |
| 2511 | */ |
| 2512 | if (opti.ts_present && |
| 2513 | SEQ_LEQ(th->th_seq, tp->last_ack_sent) && |
| 2514 | SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + |
| 2515 | ((tiflags & (TH_SYN|TH_FIN)) != 0))) { |
| 2516 | tp->ts_recent_age = tcp_now; |
| 2517 | tp->ts_recent = opti.ts_val; |
| 2518 | } |
| 2519 | |
| 2520 | /* |
| 2521 | * If the RST bit is set examine the state: |
| 2522 | * SYN_RECEIVED STATE: |
| 2523 | * If passive open, return to LISTEN state. |
| 2524 | * If active open, inform user that connection was refused. |
| 2525 | * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: |
| 2526 | * Inform user that connection was reset, and close tcb. |
| 2527 | * CLOSING, LAST_ACK, TIME_WAIT STATES |
| 2528 | * Close the tcb. |
| 2529 | */ |
| 2530 | if (tiflags & TH_RST) { |
| 2531 | if (th->th_seq != tp->rcv_nxt) |
| 2532 | goto dropafterack_ratelim; |
| 2533 | |
| 2534 | switch (tp->t_state) { |
| 2535 | case TCPS_SYN_RECEIVED: |
| 2536 | so->so_error = ECONNREFUSED; |
| 2537 | goto close; |
| 2538 | |
| 2539 | case TCPS_ESTABLISHED: |
| 2540 | case TCPS_FIN_WAIT_1: |
| 2541 | case TCPS_FIN_WAIT_2: |
| 2542 | case TCPS_CLOSE_WAIT: |
| 2543 | so->so_error = ECONNRESET; |
| 2544 | close: |
| 2545 | tp->t_state = TCPS_CLOSED; |
| 2546 | TCP_STATINC(TCP_STAT_DROPS); |
| 2547 | tp = tcp_close(tp); |
| 2548 | goto drop; |
| 2549 | |
| 2550 | case TCPS_CLOSING: |
| 2551 | case TCPS_LAST_ACK: |
| 2552 | case TCPS_TIME_WAIT: |
| 2553 | tp = tcp_close(tp); |
| 2554 | goto drop; |
| 2555 | } |
| 2556 | } |
| 2557 | |
| 2558 | /* |
| 2559 | * Since we've covered the SYN-SENT and SYN-RECEIVED states above |
| 2560 | * we must be in a synchronized state. RFC793 states (under RST |
| 2561 | * generation) that any unacceptable segment (an out-of-order SYN |
| 2562 | * qualifies) received in a synchronized state must elicit only an |
| 2563 | * empty acknowledgment segment ... and the connection remains in |
| 2564 | * the same state. |
| 2565 | */ |
| 2566 | if (tiflags & TH_SYN) { |
| 2567 | if (tp->rcv_nxt == th->th_seq) { |
| 2568 | tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack - 1, |
| 2569 | TH_ACK); |
| 2570 | if (tcp_saveti) |
| 2571 | m_freem(tcp_saveti); |
| 2572 | return; |
| 2573 | } |
| 2574 | |
| 2575 | goto dropafterack_ratelim; |
| 2576 | } |
| 2577 | |
| 2578 | /* |
| 2579 | * If the ACK bit is off we drop the segment and return. |
| 2580 | */ |
| 2581 | if ((tiflags & TH_ACK) == 0) { |
| 2582 | if (tp->t_flags & TF_ACKNOW) |
| 2583 | goto dropafterack; |
| 2584 | else |
| 2585 | goto drop; |
| 2586 | } |
| 2587 | |
| 2588 | /* |
| 2589 | * Ack processing. |
| 2590 | */ |
| 2591 | switch (tp->t_state) { |
| 2592 | |
| 2593 | /* |
| 2594 | * In SYN_RECEIVED state if the ack ACKs our SYN then enter |
| 2595 | * ESTABLISHED state and continue processing, otherwise |
| 2596 | * send an RST. |
| 2597 | */ |
| 2598 | case TCPS_SYN_RECEIVED: |
| 2599 | if (SEQ_GT(tp->snd_una, th->th_ack) || |
| 2600 | SEQ_GT(th->th_ack, tp->snd_max)) |
| 2601 | goto dropwithreset; |
| 2602 | TCP_STATINC(TCP_STAT_CONNECTS); |
| 2603 | soisconnected(so); |
| 2604 | tcp_established(tp); |
| 2605 | /* Do window scaling? */ |
| 2606 | if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == |
| 2607 | (TF_RCVD_SCALE|TF_REQ_SCALE)) { |
| 2608 | tp->snd_scale = tp->requested_s_scale; |
| 2609 | tp->rcv_scale = tp->request_r_scale; |
| 2610 | } |
| 2611 | TCP_REASS_LOCK(tp); |
| 2612 | (void) tcp_reass(tp, NULL, NULL, &tlen); |
| 2613 | tp->snd_wl1 = th->th_seq - 1; |
| 2614 | /* fall into ... */ |
| 2615 | |
| 2616 | /* |
| 2617 | * In ESTABLISHED state: drop duplicate ACKs; ACK out of range |
| 2618 | * ACKs. If the ack is in the range |
| 2619 | * tp->snd_una < th->th_ack <= tp->snd_max |
| 2620 | * then advance tp->snd_una to th->th_ack and drop |
| 2621 | * data from the retransmission queue. If this ACK reflects |
| 2622 | * more up to date window information we update our window information. |
| 2623 | */ |
| 2624 | case TCPS_ESTABLISHED: |
| 2625 | case TCPS_FIN_WAIT_1: |
| 2626 | case TCPS_FIN_WAIT_2: |
| 2627 | case TCPS_CLOSE_WAIT: |
| 2628 | case TCPS_CLOSING: |
| 2629 | case TCPS_LAST_ACK: |
| 2630 | case TCPS_TIME_WAIT: |
| 2631 | |
| 2632 | if (SEQ_LEQ(th->th_ack, tp->snd_una)) { |
| 2633 | if (tlen == 0 && !dupseg && tiwin == tp->snd_wnd) { |
| 2634 | TCP_STATINC(TCP_STAT_RCVDUPACK); |
| 2635 | /* |
| 2636 | * If we have outstanding data (other than |
| 2637 | * a window probe), this is a completely |
| 2638 | * duplicate ack (ie, window info didn't |
| 2639 | * change), the ack is the biggest we've |
| 2640 | * seen and we've seen exactly our rexmt |
| 2641 | * threshold of them, assume a packet |
| 2642 | * has been dropped and retransmit it. |
| 2643 | * Kludge snd_nxt & the congestion |
| 2644 | * window so we send only this one |
| 2645 | * packet. |
| 2646 | */ |
| 2647 | if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 || |
| 2648 | th->th_ack != tp->snd_una) |
| 2649 | tp->t_dupacks = 0; |
| 2650 | else if (tp->t_partialacks < 0 && |
| 2651 | (++tp->t_dupacks == tcprexmtthresh || |
| 2652 | TCP_FACK_FASTRECOV(tp))) { |
| 2653 | /* |
| 2654 | * Do the fast retransmit, and adjust |
| 2655 | * congestion control parameters. |
| 2656 | */ |
| 2657 | if (tp->t_congctl->fast_retransmit(tp, th)) { |
| 2658 | /* False fast retransmit */ |
| 2659 | break; |
| 2660 | } else |
| 2661 | goto drop; |
| 2662 | } else if (tp->t_dupacks > tcprexmtthresh) { |
| 2663 | tp->snd_cwnd += tp->t_segsz; |
| 2664 | KERNEL_LOCK(1, NULL); |
| 2665 | (void) tcp_output(tp); |
| 2666 | KERNEL_UNLOCK_ONE(NULL); |
| 2667 | goto drop; |
| 2668 | } |
| 2669 | } else { |
| 2670 | /* |
| 2671 | * If the ack appears to be very old, only |
| 2672 | * allow data that is in-sequence. This |
| 2673 | * makes it somewhat more difficult to insert |
| 2674 | * forged data by guessing sequence numbers. |
| 2675 | * Send an ack to try to update the send |
| 2676 | * sequence number on the other side. |
| 2677 | */ |
| 2678 | if (tlen && th->th_seq != tp->rcv_nxt && |
| 2679 | SEQ_LT(th->th_ack, |
| 2680 | tp->snd_una - tp->max_sndwnd)) |
| 2681 | goto dropafterack; |
| 2682 | } |
| 2683 | break; |
| 2684 | } |
| 2685 | /* |
| 2686 | * If the congestion window was inflated to account |
| 2687 | * for the other side's cached packets, retract it. |
| 2688 | */ |
| 2689 | tp->t_congctl->fast_retransmit_newack(tp, th); |
| 2690 | |
| 2691 | if (SEQ_GT(th->th_ack, tp->snd_max)) { |
| 2692 | TCP_STATINC(TCP_STAT_RCVACKTOOMUCH); |
| 2693 | goto dropafterack; |
| 2694 | } |
| 2695 | acked = th->th_ack - tp->snd_una; |
| 2696 | tcps = TCP_STAT_GETREF(); |
| 2697 | tcps[TCP_STAT_RCVACKPACK]++; |
| 2698 | tcps[TCP_STAT_RCVACKBYTE] += acked; |
| 2699 | TCP_STAT_PUTREF(); |
| 2700 | |
| 2701 | /* |
| 2702 | * If we have a timestamp reply, update smoothed |
| 2703 | * round trip time. If no timestamp is present but |
| 2704 | * transmit timer is running and timed sequence |
| 2705 | * number was acked, update smoothed round trip time. |
| 2706 | * Since we now have an rtt measurement, cancel the |
| 2707 | * timer backoff (cf., Phil Karn's retransmit alg.). |
| 2708 | * Recompute the initial retransmit timer. |
| 2709 | */ |
| 2710 | if (ts_rtt) |
| 2711 | tcp_xmit_timer(tp, ts_rtt - 1); |
| 2712 | else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) |
| 2713 | tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); |
| 2714 | |
| 2715 | /* |
| 2716 | * If all outstanding data is acked, stop retransmit |
| 2717 | * timer and remember to restart (more output or persist). |
| 2718 | * If there is more data to be acked, restart retransmit |
| 2719 | * timer, using current (possibly backed-off) value. |
| 2720 | */ |
| 2721 | if (th->th_ack == tp->snd_max) { |
| 2722 | TCP_TIMER_DISARM(tp, TCPT_REXMT); |
| 2723 | needoutput = 1; |
| 2724 | } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) |
| 2725 | TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); |
| 2726 | |
| 2727 | /* |
| 2728 | * New data has been acked, adjust the congestion window. |
| 2729 | */ |
| 2730 | tp->t_congctl->newack(tp, th); |
| 2731 | |
| 2732 | nd6_hint(tp); |
| 2733 | if (acked > so->so_snd.sb_cc) { |
| 2734 | tp->snd_wnd -= so->so_snd.sb_cc; |
| 2735 | sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); |
| 2736 | ourfinisacked = 1; |
| 2737 | } else { |
| 2738 | if (acked > (tp->t_lastoff - tp->t_inoff)) |
| 2739 | tp->t_lastm = NULL; |
| 2740 | sbdrop(&so->so_snd, acked); |
| 2741 | tp->t_lastoff -= acked; |
| 2742 | if (tp->snd_wnd > acked) |
| 2743 | tp->snd_wnd -= acked; |
| 2744 | else |
| 2745 | tp->snd_wnd = 0; |
| 2746 | ourfinisacked = 0; |
| 2747 | } |
| 2748 | sowwakeup(so); |
| 2749 | |
| 2750 | icmp_check(tp, th, acked); |
| 2751 | |
| 2752 | tp->snd_una = th->th_ack; |
| 2753 | if (SEQ_GT(tp->snd_una, tp->snd_fack)) |
| 2754 | tp->snd_fack = tp->snd_una; |
| 2755 | if (SEQ_LT(tp->snd_nxt, tp->snd_una)) |
| 2756 | tp->snd_nxt = tp->snd_una; |
| 2757 | if (SEQ_LT(tp->snd_high, tp->snd_una)) |
| 2758 | tp->snd_high = tp->snd_una; |
| 2759 | |
| 2760 | switch (tp->t_state) { |
| 2761 | |
| 2762 | /* |
| 2763 | * In FIN_WAIT_1 STATE in addition to the processing |
| 2764 | * for the ESTABLISHED state if our FIN is now acknowledged |
| 2765 | * then enter FIN_WAIT_2. |
| 2766 | */ |
| 2767 | case TCPS_FIN_WAIT_1: |
| 2768 | if (ourfinisacked) { |
| 2769 | /* |
| 2770 | * If we can't receive any more |
| 2771 | * data, then closing user can proceed. |
| 2772 | * Starting the timer is contrary to the |
| 2773 | * specification, but if we don't get a FIN |
| 2774 | * we'll hang forever. |
| 2775 | */ |
| 2776 | if (so->so_state & SS_CANTRCVMORE) { |
| 2777 | soisdisconnected(so); |
| 2778 | if (tp->t_maxidle > 0) |
| 2779 | TCP_TIMER_ARM(tp, TCPT_2MSL, |
| 2780 | tp->t_maxidle); |
| 2781 | } |
| 2782 | tp->t_state = TCPS_FIN_WAIT_2; |
| 2783 | } |
| 2784 | break; |
| 2785 | |
| 2786 | /* |
| 2787 | * In CLOSING STATE in addition to the processing for |
| 2788 | * the ESTABLISHED state if the ACK acknowledges our FIN |
| 2789 | * then enter the TIME-WAIT state, otherwise ignore |
| 2790 | * the segment. |
| 2791 | */ |
| 2792 | case TCPS_CLOSING: |
| 2793 | if (ourfinisacked) { |
| 2794 | tp->t_state = TCPS_TIME_WAIT; |
| 2795 | tcp_canceltimers(tp); |
| 2796 | TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl); |
| 2797 | soisdisconnected(so); |
| 2798 | } |
| 2799 | break; |
| 2800 | |
| 2801 | /* |
| 2802 | * In LAST_ACK, we may still be waiting for data to drain |
| 2803 | * and/or to be acked, as well as for the ack of our FIN. |
| 2804 | * If our FIN is now acknowledged, delete the TCB, |
| 2805 | * enter the closed state and return. |
| 2806 | */ |
| 2807 | case TCPS_LAST_ACK: |
| 2808 | if (ourfinisacked) { |
| 2809 | tp = tcp_close(tp); |
| 2810 | goto drop; |
| 2811 | } |
| 2812 | break; |
| 2813 | |
| 2814 | /* |
| 2815 | * In TIME_WAIT state the only thing that should arrive |
| 2816 | * is a retransmission of the remote FIN. Acknowledge |
| 2817 | * it and restart the finack timer. |
| 2818 | */ |
| 2819 | case TCPS_TIME_WAIT: |
| 2820 | TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl); |
| 2821 | goto dropafterack; |
| 2822 | } |
| 2823 | } |
| 2824 | |
| 2825 | step6: |
| 2826 | /* |
| 2827 | * Update window information. |
| 2828 | * Don't look at window if no ACK: TAC's send garbage on first SYN. |
| 2829 | */ |
| 2830 | if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || |
| 2831 | (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || |
| 2832 | (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { |
| 2833 | /* keep track of pure window updates */ |
| 2834 | if (tlen == 0 && |
| 2835 | tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) |
| 2836 | TCP_STATINC(TCP_STAT_RCVWINUPD); |
| 2837 | tp->snd_wnd = tiwin; |
| 2838 | tp->snd_wl1 = th->th_seq; |
| 2839 | tp->snd_wl2 = th->th_ack; |
| 2840 | if (tp->snd_wnd > tp->max_sndwnd) |
| 2841 | tp->max_sndwnd = tp->snd_wnd; |
| 2842 | needoutput = 1; |
| 2843 | } |
| 2844 | |
| 2845 | /* |
| 2846 | * Process segments with URG. |
| 2847 | */ |
| 2848 | if ((tiflags & TH_URG) && th->th_urp && |
| 2849 | TCPS_HAVERCVDFIN(tp->t_state) == 0) { |
| 2850 | /* |
| 2851 | * This is a kludge, but if we receive and accept |
| 2852 | * random urgent pointers, we'll crash in |
| 2853 | * soreceive. It's hard to imagine someone |
| 2854 | * actually wanting to send this much urgent data. |
| 2855 | */ |
| 2856 | if (th->th_urp + so->so_rcv.sb_cc > sb_max) { |
| 2857 | th->th_urp = 0; /* XXX */ |
| 2858 | tiflags &= ~TH_URG; /* XXX */ |
| 2859 | goto dodata; /* XXX */ |
| 2860 | } |
| 2861 | /* |
| 2862 | * If this segment advances the known urgent pointer, |
| 2863 | * then mark the data stream. This should not happen |
| 2864 | * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since |
| 2865 | * a FIN has been received from the remote side. |
| 2866 | * In these states we ignore the URG. |
| 2867 | * |
| 2868 | * According to RFC961 (Assigned Protocols), |
| 2869 | * the urgent pointer points to the last octet |
| 2870 | * of urgent data. We continue, however, |
| 2871 | * to consider it to indicate the first octet |
| 2872 | * of data past the urgent section as the original |
| 2873 | * spec states (in one of two places). |
| 2874 | */ |
| 2875 | if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { |
| 2876 | tp->rcv_up = th->th_seq + th->th_urp; |
| 2877 | so->so_oobmark = so->so_rcv.sb_cc + |
| 2878 | (tp->rcv_up - tp->rcv_nxt) - 1; |
| 2879 | if (so->so_oobmark == 0) |
| 2880 | so->so_state |= SS_RCVATMARK; |
| 2881 | sohasoutofband(so); |
| 2882 | tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); |
| 2883 | } |
| 2884 | /* |
| 2885 | * Remove out of band data so doesn't get presented to user. |
| 2886 | * This can happen independent of advancing the URG pointer, |
| 2887 | * but if two URG's are pending at once, some out-of-band |
| 2888 | * data may creep in... ick. |
| 2889 | */ |
| 2890 | if (th->th_urp <= (u_int16_t) tlen |
| 2891 | #ifdef SO_OOBINLINE |
| 2892 | && (so->so_options & SO_OOBINLINE) == 0 |
| 2893 | #endif |
| 2894 | ) |
| 2895 | tcp_pulloutofband(so, th, m, hdroptlen); |
| 2896 | } else |
| 2897 | /* |
| 2898 | * If no out of band data is expected, |
| 2899 | * pull receive urgent pointer along |
| 2900 | * with the receive window. |
| 2901 | */ |
| 2902 | if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) |
| 2903 | tp->rcv_up = tp->rcv_nxt; |
| 2904 | dodata: /* XXX */ |
| 2905 | |
| 2906 | /* |
| 2907 | * Process the segment text, merging it into the TCP sequencing queue, |
| 2908 | * and arranging for acknowledgement of receipt if necessary. |
| 2909 | * This process logically involves adjusting tp->rcv_wnd as data |
| 2910 | * is presented to the user (this happens in tcp_usrreq.c, |
| 2911 | * tcp_rcvd()). If a FIN has already been received on this |
| 2912 | * connection then we just ignore the text. |
| 2913 | */ |
| 2914 | if ((tlen || (tiflags & TH_FIN)) && |
| 2915 | TCPS_HAVERCVDFIN(tp->t_state) == 0) { |
| 2916 | /* |
| 2917 | * Insert segment ti into reassembly queue of tcp with |
| 2918 | * control block tp. Return TH_FIN if reassembly now includes |
| 2919 | * a segment with FIN. The macro form does the common case |
| 2920 | * inline (segment is the next to be received on an |
| 2921 | * established connection, and the queue is empty), |
| 2922 | * avoiding linkage into and removal from the queue and |
| 2923 | * repetition of various conversions. |
| 2924 | * Set DELACK for segments received in order, but ack |
| 2925 | * immediately when segments are out of order |
| 2926 | * (so fast retransmit can work). |
| 2927 | */ |
| 2928 | /* NOTE: this was TCP_REASS() macro, but used only once */ |
| 2929 | TCP_REASS_LOCK(tp); |
| 2930 | if (th->th_seq == tp->rcv_nxt && |
| 2931 | TAILQ_FIRST(&tp->segq) == NULL && |
| 2932 | tp->t_state == TCPS_ESTABLISHED) { |
| 2933 | tcp_setup_ack(tp, th); |
| 2934 | tp->rcv_nxt += tlen; |
| 2935 | tiflags = th->th_flags & TH_FIN; |
| 2936 | tcps = TCP_STAT_GETREF(); |
| 2937 | tcps[TCP_STAT_RCVPACK]++; |
| 2938 | tcps[TCP_STAT_RCVBYTE] += tlen; |
| 2939 | TCP_STAT_PUTREF(); |
| 2940 | nd6_hint(tp); |
| 2941 | if (so->so_state & SS_CANTRCVMORE) |
| 2942 | m_freem(m); |
| 2943 | else { |
| 2944 | m_adj(m, hdroptlen); |
| 2945 | sbappendstream(&(so)->so_rcv, m); |
| 2946 | } |
| 2947 | TCP_REASS_UNLOCK(tp); |
| 2948 | sorwakeup(so); |
| 2949 | } else { |
| 2950 | m_adj(m, hdroptlen); |
| 2951 | tiflags = tcp_reass(tp, th, m, &tlen); |
| 2952 | tp->t_flags |= TF_ACKNOW; |
| 2953 | } |
| 2954 | |
| 2955 | /* |
| 2956 | * Note the amount of data that peer has sent into |
| 2957 | * our window, in order to estimate the sender's |
| 2958 | * buffer size. |
| 2959 | */ |
| 2960 | len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); |
| 2961 | } else { |
| 2962 | m_freem(m); |
| 2963 | m = NULL; |
| 2964 | tiflags &= ~TH_FIN; |
| 2965 | } |
| 2966 | |
| 2967 | /* |
| 2968 | * If FIN is received ACK the FIN and let the user know |
| 2969 | * that the connection is closing. Ignore a FIN received before |
| 2970 | * the connection is fully established. |
| 2971 | */ |
| 2972 | if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { |
| 2973 | if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { |
| 2974 | socantrcvmore(so); |
| 2975 | tp->t_flags |= TF_ACKNOW; |
| 2976 | tp->rcv_nxt++; |
| 2977 | } |
| 2978 | switch (tp->t_state) { |
| 2979 | |
| 2980 | /* |
| 2981 | * In ESTABLISHED STATE enter the CLOSE_WAIT state. |
| 2982 | */ |
| 2983 | case TCPS_ESTABLISHED: |
| 2984 | tp->t_state = TCPS_CLOSE_WAIT; |
| 2985 | break; |
| 2986 | |
| 2987 | /* |
| 2988 | * If still in FIN_WAIT_1 STATE FIN has not been acked so |
| 2989 | * enter the CLOSING state. |
| 2990 | */ |
| 2991 | case TCPS_FIN_WAIT_1: |
| 2992 | tp->t_state = TCPS_CLOSING; |
| 2993 | break; |
| 2994 | |
| 2995 | /* |
| 2996 | * In FIN_WAIT_2 state enter the TIME_WAIT state, |
| 2997 | * starting the time-wait timer, turning off the other |
| 2998 | * standard timers. |
| 2999 | */ |
| 3000 | case TCPS_FIN_WAIT_2: |
| 3001 | tp->t_state = TCPS_TIME_WAIT; |
| 3002 | tcp_canceltimers(tp); |
| 3003 | TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl); |
| 3004 | soisdisconnected(so); |
| 3005 | break; |
| 3006 | |
| 3007 | /* |
| 3008 | * In TIME_WAIT state restart the 2 MSL time_wait timer. |
| 3009 | */ |
| 3010 | case TCPS_TIME_WAIT: |
| 3011 | TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl); |
| 3012 | break; |
| 3013 | } |
| 3014 | } |
| 3015 | #ifdef TCP_DEBUG |
| 3016 | if (so->so_options & SO_DEBUG) |
| 3017 | tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0); |
| 3018 | #endif |
| 3019 | |
| 3020 | /* |
| 3021 | * Return any desired output. |
| 3022 | */ |
| 3023 | if (needoutput || (tp->t_flags & TF_ACKNOW)) { |
| 3024 | KERNEL_LOCK(1, NULL); |
| 3025 | (void) tcp_output(tp); |
| 3026 | KERNEL_UNLOCK_ONE(NULL); |
| 3027 | } |
| 3028 | if (tcp_saveti) |
| 3029 | m_freem(tcp_saveti); |
| 3030 | |
| 3031 | if (tp->t_state == TCPS_TIME_WAIT |
| 3032 | && (so->so_state & SS_NOFDREF) |
| 3033 | && (tp->t_inpcb || af != AF_INET) |
| 3034 | && (tp->t_in6pcb || af != AF_INET6) |
| 3035 | && ((af == AF_INET ? tcp4_vtw_enable : tcp6_vtw_enable) & 1) != 0 |
| 3036 | && TAILQ_EMPTY(&tp->segq) |
| 3037 | && vtw_add(af, tp)) { |
| 3038 | ; |
| 3039 | } |
| 3040 | return; |
| 3041 | |
| 3042 | badsyn: |
| 3043 | /* |
| 3044 | * Received a bad SYN. Increment counters and dropwithreset. |
| 3045 | */ |
| 3046 | TCP_STATINC(TCP_STAT_BADSYN); |
| 3047 | tp = NULL; |
| 3048 | goto dropwithreset; |
| 3049 | |
| 3050 | dropafterack: |
| 3051 | /* |
| 3052 | * Generate an ACK dropping incoming segment if it occupies |
| 3053 | * sequence space, where the ACK reflects our state. |
| 3054 | */ |
| 3055 | if (tiflags & TH_RST) |
| 3056 | goto drop; |
| 3057 | goto dropafterack2; |
| 3058 | |
| 3059 | dropafterack_ratelim: |
| 3060 | /* |
| 3061 | * We may want to rate-limit ACKs against SYN/RST attack. |
| 3062 | */ |
| 3063 | if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, |
| 3064 | tcp_ackdrop_ppslim) == 0) { |
| 3065 | /* XXX stat */ |
| 3066 | goto drop; |
| 3067 | } |
| 3068 | /* ...fall into dropafterack2... */ |
| 3069 | |
| 3070 | dropafterack2: |
| 3071 | m_freem(m); |
| 3072 | tp->t_flags |= TF_ACKNOW; |
| 3073 | KERNEL_LOCK(1, NULL); |
| 3074 | (void) tcp_output(tp); |
| 3075 | KERNEL_UNLOCK_ONE(NULL); |
| 3076 | if (tcp_saveti) |
| 3077 | m_freem(tcp_saveti); |
| 3078 | return; |
| 3079 | |
| 3080 | dropwithreset_ratelim: |
| 3081 | /* |
| 3082 | * We may want to rate-limit RSTs in certain situations, |
| 3083 | * particularly if we are sending an RST in response to |
| 3084 | * an attempt to connect to or otherwise communicate with |
| 3085 | * a port for which we have no socket. |
| 3086 | */ |
| 3087 | if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, |
| 3088 | tcp_rst_ppslim) == 0) { |
| 3089 | /* XXX stat */ |
| 3090 | goto drop; |
| 3091 | } |
| 3092 | /* ...fall into dropwithreset... */ |
| 3093 | |
| 3094 | dropwithreset: |
| 3095 | /* |
| 3096 | * Generate a RST, dropping incoming segment. |
| 3097 | * Make ACK acceptable to originator of segment. |
| 3098 | */ |
| 3099 | if (tiflags & TH_RST) |
| 3100 | goto drop; |
| 3101 | |
| 3102 | switch (af) { |
| 3103 | #ifdef INET6 |
| 3104 | case AF_INET6: |
| 3105 | /* For following calls to tcp_respond */ |
| 3106 | if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) |
| 3107 | goto drop; |
| 3108 | break; |
| 3109 | #endif /* INET6 */ |
| 3110 | case AF_INET: |
| 3111 | if (IN_MULTICAST(ip->ip_dst.s_addr) || |
| 3112 | in_broadcast(ip->ip_dst, m_get_rcvif_NOMPSAFE(m))) |
| 3113 | goto drop; |
| 3114 | } |
| 3115 | |
| 3116 | if (tiflags & TH_ACK) |
| 3117 | (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); |
| 3118 | else { |
| 3119 | if (tiflags & TH_SYN) |
| 3120 | tlen++; |
| 3121 | (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0, |
| 3122 | TH_RST|TH_ACK); |
| 3123 | } |
| 3124 | if (tcp_saveti) |
| 3125 | m_freem(tcp_saveti); |
| 3126 | return; |
| 3127 | |
| 3128 | badcsum: |
| 3129 | drop: |
| 3130 | /* |
| 3131 | * Drop space held by incoming segment and return. |
| 3132 | */ |
| 3133 | if (tp) { |
| 3134 | if (tp->t_inpcb) |
| 3135 | so = tp->t_inpcb->inp_socket; |
| 3136 | #ifdef INET6 |
| 3137 | else if (tp->t_in6pcb) |
| 3138 | so = tp->t_in6pcb->in6p_socket; |
| 3139 | #endif |
| 3140 | else |
| 3141 | so = NULL; |
| 3142 | #ifdef TCP_DEBUG |
| 3143 | if (so && (so->so_options & SO_DEBUG) != 0) |
| 3144 | tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0); |
| 3145 | #endif |
| 3146 | } |
| 3147 | if (tcp_saveti) |
| 3148 | m_freem(tcp_saveti); |
| 3149 | m_freem(m); |
| 3150 | return; |
| 3151 | } |
| 3152 | |
| 3153 | #ifdef TCP_SIGNATURE |
| 3154 | int |
| 3155 | tcp_signature_apply(void *fstate, void *data, u_int len) |
| 3156 | { |
| 3157 | |
| 3158 | MD5Update(fstate, (u_char *)data, len); |
| 3159 | return (0); |
| 3160 | } |
| 3161 | |
| 3162 | struct secasvar * |
| 3163 | tcp_signature_getsav(struct mbuf *m, struct tcphdr *th) |
| 3164 | { |
| 3165 | struct ip *ip; |
| 3166 | struct ip6_hdr *ip6; |
| 3167 | |
| 3168 | ip = mtod(m, struct ip *); |
| 3169 | switch (ip->ip_v) { |
| 3170 | case 4: |
| 3171 | ip = mtod(m, struct ip *); |
| 3172 | ip6 = NULL; |
| 3173 | break; |
| 3174 | case 6: |
| 3175 | ip = NULL; |
| 3176 | ip6 = mtod(m, struct ip6_hdr *); |
| 3177 | break; |
| 3178 | default: |
| 3179 | return (NULL); |
| 3180 | } |
| 3181 | |
| 3182 | #ifdef IPSEC |
| 3183 | if (ipsec_used) { |
| 3184 | union sockaddr_union dst; |
| 3185 | /* Extract the destination from the IP header in the mbuf. */ |
| 3186 | memset(&dst, 0, sizeof(union sockaddr_union)); |
| 3187 | if (ip != NULL) { |
| 3188 | dst.sa.sa_len = sizeof(struct sockaddr_in); |
| 3189 | dst.sa.sa_family = AF_INET; |
| 3190 | dst.sin.sin_addr = ip->ip_dst; |
| 3191 | } else { |
| 3192 | dst.sa.sa_len = sizeof(struct sockaddr_in6); |
| 3193 | dst.sa.sa_family = AF_INET6; |
| 3194 | dst.sin6.sin6_addr = ip6->ip6_dst; |
| 3195 | } |
| 3196 | |
| 3197 | /* |
| 3198 | * Look up an SADB entry which matches the address of the peer. |
| 3199 | */ |
| 3200 | return KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI), 0, 0); |
| 3201 | } |
| 3202 | return NULL; |
| 3203 | #else |
| 3204 | if (ip) |
| 3205 | return key_allocsa(AF_INET, (void *)&ip->ip_src, |
| 3206 | (void *)&ip->ip_dst, IPPROTO_TCP, |
| 3207 | htonl(TCP_SIG_SPI), 0, 0); |
| 3208 | else |
| 3209 | return key_allocsa(AF_INET6, (void *)&ip6->ip6_src, |
| 3210 | (void *)&ip6->ip6_dst, IPPROTO_TCP, |
| 3211 | htonl(TCP_SIG_SPI), 0, 0); |
| 3212 | #endif |
| 3213 | } |
| 3214 | |
| 3215 | int |
| 3216 | tcp_signature(struct mbuf *m, struct tcphdr *th, int thoff, |
| 3217 | struct secasvar *sav, char *sig) |
| 3218 | { |
| 3219 | MD5_CTX ctx; |
| 3220 | struct ip *ip; |
| 3221 | struct ipovly *ipovly; |
| 3222 | #ifdef INET6 |
| 3223 | struct ip6_hdr *ip6; |
| 3224 | struct ip6_hdr_pseudo ip6pseudo; |
| 3225 | #endif /* INET6 */ |
| 3226 | struct ippseudo ippseudo; |
| 3227 | struct tcphdr th0; |
| 3228 | int l, tcphdrlen; |
| 3229 | |
| 3230 | if (sav == NULL) |
| 3231 | return (-1); |
| 3232 | |
| 3233 | tcphdrlen = th->th_off * 4; |
| 3234 | |
| 3235 | switch (mtod(m, struct ip *)->ip_v) { |
| 3236 | case 4: |
| 3237 | MD5Init(&ctx); |
| 3238 | ip = mtod(m, struct ip *); |
| 3239 | memset(&ippseudo, 0, sizeof(ippseudo)); |
| 3240 | ipovly = (struct ipovly *)ip; |
| 3241 | ippseudo.ippseudo_src = ipovly->ih_src; |
| 3242 | ippseudo.ippseudo_dst = ipovly->ih_dst; |
| 3243 | ippseudo.ippseudo_pad = 0; |
| 3244 | ippseudo.ippseudo_p = IPPROTO_TCP; |
| 3245 | ippseudo.ippseudo_len = htons(m->m_pkthdr.len - thoff); |
| 3246 | MD5Update(&ctx, (char *)&ippseudo, sizeof(ippseudo)); |
| 3247 | break; |
| 3248 | #if INET6 |
| 3249 | case 6: |
| 3250 | MD5Init(&ctx); |
| 3251 | ip6 = mtod(m, struct ip6_hdr *); |
| 3252 | memset(&ip6pseudo, 0, sizeof(ip6pseudo)); |
| 3253 | ip6pseudo.ip6ph_src = ip6->ip6_src; |
| 3254 | in6_clearscope(&ip6pseudo.ip6ph_src); |
| 3255 | ip6pseudo.ip6ph_dst = ip6->ip6_dst; |
| 3256 | in6_clearscope(&ip6pseudo.ip6ph_dst); |
| 3257 | ip6pseudo.ip6ph_len = htons(m->m_pkthdr.len - thoff); |
| 3258 | ip6pseudo.ip6ph_nxt = IPPROTO_TCP; |
| 3259 | MD5Update(&ctx, (char *)&ip6pseudo, sizeof(ip6pseudo)); |
| 3260 | break; |
| 3261 | #endif /* INET6 */ |
| 3262 | default: |
| 3263 | return (-1); |
| 3264 | } |
| 3265 | |
| 3266 | th0 = *th; |
| 3267 | th0.th_sum = 0; |
| 3268 | MD5Update(&ctx, (char *)&th0, sizeof(th0)); |
| 3269 | |
| 3270 | l = m->m_pkthdr.len - thoff - tcphdrlen; |
| 3271 | if (l > 0) |
| 3272 | m_apply(m, thoff + tcphdrlen, |
| 3273 | m->m_pkthdr.len - thoff - tcphdrlen, |
| 3274 | tcp_signature_apply, &ctx); |
| 3275 | |
| 3276 | MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); |
| 3277 | MD5Final(sig, &ctx); |
| 3278 | |
| 3279 | return (0); |
| 3280 | } |
| 3281 | #endif |
| 3282 | |
| 3283 | /* |
| 3284 | * tcp_dooptions: parse and process tcp options. |
| 3285 | * |
| 3286 | * returns -1 if this segment should be dropped. (eg. wrong signature) |
| 3287 | * otherwise returns 0. |
| 3288 | */ |
| 3289 | |
| 3290 | static int |
| 3291 | tcp_dooptions(struct tcpcb *tp, const u_char *cp, int cnt, |
| 3292 | struct tcphdr *th, |
| 3293 | struct mbuf *m, int toff, struct tcp_opt_info *oi) |
| 3294 | { |
| 3295 | u_int16_t mss; |
| 3296 | int opt, optlen = 0; |
| 3297 | #ifdef TCP_SIGNATURE |
| 3298 | void *sigp = NULL; |
| 3299 | char sigbuf[TCP_SIGLEN]; |
| 3300 | struct secasvar *sav = NULL; |
| 3301 | #endif |
| 3302 | |
| 3303 | for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { |
| 3304 | opt = cp[0]; |
| 3305 | if (opt == TCPOPT_EOL) |
| 3306 | break; |
| 3307 | if (opt == TCPOPT_NOP) |
| 3308 | optlen = 1; |
| 3309 | else { |
| 3310 | if (cnt < 2) |
| 3311 | break; |
| 3312 | optlen = cp[1]; |
| 3313 | if (optlen < 2 || optlen > cnt) |
| 3314 | break; |
| 3315 | } |
| 3316 | switch (opt) { |
| 3317 | |
| 3318 | default: |
| 3319 | continue; |
| 3320 | |
| 3321 | case TCPOPT_MAXSEG: |
| 3322 | if (optlen != TCPOLEN_MAXSEG) |
| 3323 | continue; |
| 3324 | if (!(th->th_flags & TH_SYN)) |
| 3325 | continue; |
| 3326 | if (TCPS_HAVERCVDSYN(tp->t_state)) |
| 3327 | continue; |
| 3328 | bcopy(cp + 2, &mss, sizeof(mss)); |
| 3329 | oi->maxseg = ntohs(mss); |
| 3330 | break; |
| 3331 | |
| 3332 | case TCPOPT_WINDOW: |
| 3333 | if (optlen != TCPOLEN_WINDOW) |
| 3334 | continue; |
| 3335 | if (!(th->th_flags & TH_SYN)) |
| 3336 | continue; |
| 3337 | if (TCPS_HAVERCVDSYN(tp->t_state)) |
| 3338 | continue; |
| 3339 | tp->t_flags |= TF_RCVD_SCALE; |
| 3340 | tp->requested_s_scale = cp[2]; |
| 3341 | if (tp->requested_s_scale > TCP_MAX_WINSHIFT) { |
| 3342 | char buf[INET6_ADDRSTRLEN]; |
| 3343 | struct ip *ip = mtod(m, struct ip *); |
| 3344 | #ifdef INET6 |
| 3345 | struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); |
| 3346 | #endif |
| 3347 | if (ip) |
| 3348 | in_print(buf, sizeof(buf), |
| 3349 | &ip->ip_src); |
| 3350 | #ifdef INET6 |
| 3351 | else if (ip6) |
| 3352 | in6_print(buf, sizeof(buf), |
| 3353 | &ip6->ip6_src); |
| 3354 | #endif |
| 3355 | else |
| 3356 | strlcpy(buf, "(unknown)" , sizeof(buf)); |
| 3357 | log(LOG_ERR, "TCP: invalid wscale %d from %s, " |
| 3358 | "assuming %d\n" , |
| 3359 | tp->requested_s_scale, buf, |
| 3360 | TCP_MAX_WINSHIFT); |
| 3361 | tp->requested_s_scale = TCP_MAX_WINSHIFT; |
| 3362 | } |
| 3363 | break; |
| 3364 | |
| 3365 | case TCPOPT_TIMESTAMP: |
| 3366 | if (optlen != TCPOLEN_TIMESTAMP) |
| 3367 | continue; |
| 3368 | oi->ts_present = 1; |
| 3369 | bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val)); |
| 3370 | NTOHL(oi->ts_val); |
| 3371 | bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr)); |
| 3372 | NTOHL(oi->ts_ecr); |
| 3373 | |
| 3374 | if (!(th->th_flags & TH_SYN)) |
| 3375 | continue; |
| 3376 | if (TCPS_HAVERCVDSYN(tp->t_state)) |
| 3377 | continue; |
| 3378 | /* |
| 3379 | * A timestamp received in a SYN makes |
| 3380 | * it ok to send timestamp requests and replies. |
| 3381 | */ |
| 3382 | tp->t_flags |= TF_RCVD_TSTMP; |
| 3383 | tp->ts_recent = oi->ts_val; |
| 3384 | tp->ts_recent_age = tcp_now; |
| 3385 | break; |
| 3386 | |
| 3387 | case TCPOPT_SACK_PERMITTED: |
| 3388 | if (optlen != TCPOLEN_SACK_PERMITTED) |
| 3389 | continue; |
| 3390 | if (!(th->th_flags & TH_SYN)) |
| 3391 | continue; |
| 3392 | if (TCPS_HAVERCVDSYN(tp->t_state)) |
| 3393 | continue; |
| 3394 | if (tcp_do_sack) { |
| 3395 | tp->t_flags |= TF_SACK_PERMIT; |
| 3396 | tp->t_flags |= TF_WILL_SACK; |
| 3397 | } |
| 3398 | break; |
| 3399 | |
| 3400 | case TCPOPT_SACK: |
| 3401 | tcp_sack_option(tp, th, cp, optlen); |
| 3402 | break; |
| 3403 | #ifdef TCP_SIGNATURE |
| 3404 | case TCPOPT_SIGNATURE: |
| 3405 | if (optlen != TCPOLEN_SIGNATURE) |
| 3406 | continue; |
| 3407 | if (sigp && memcmp(sigp, cp + 2, TCP_SIGLEN)) |
| 3408 | return (-1); |
| 3409 | |
| 3410 | sigp = sigbuf; |
| 3411 | memcpy(sigbuf, cp + 2, TCP_SIGLEN); |
| 3412 | tp->t_flags |= TF_SIGNATURE; |
| 3413 | break; |
| 3414 | #endif |
| 3415 | } |
| 3416 | } |
| 3417 | |
| 3418 | #ifndef TCP_SIGNATURE |
| 3419 | return 0; |
| 3420 | #else |
| 3421 | if (tp->t_flags & TF_SIGNATURE) { |
| 3422 | |
| 3423 | sav = tcp_signature_getsav(m, th); |
| 3424 | |
| 3425 | if (sav == NULL && tp->t_state == TCPS_LISTEN) |
| 3426 | return (-1); |
| 3427 | } |
| 3428 | |
| 3429 | if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) |
| 3430 | goto out; |
| 3431 | |
| 3432 | if (sigp) { |
| 3433 | char sig[TCP_SIGLEN]; |
| 3434 | |
| 3435 | tcp_fields_to_net(th); |
| 3436 | if (tcp_signature(m, th, toff, sav, sig) < 0) { |
| 3437 | tcp_fields_to_host(th); |
| 3438 | goto out; |
| 3439 | } |
| 3440 | tcp_fields_to_host(th); |
| 3441 | |
| 3442 | if (memcmp(sig, sigp, TCP_SIGLEN)) { |
| 3443 | TCP_STATINC(TCP_STAT_BADSIG); |
| 3444 | goto out; |
| 3445 | } else |
| 3446 | TCP_STATINC(TCP_STAT_GOODSIG); |
| 3447 | |
| 3448 | key_sa_recordxfer(sav, m); |
| 3449 | KEY_FREESAV(&sav); |
| 3450 | } |
| 3451 | return 0; |
| 3452 | out: |
| 3453 | if (sav != NULL) |
| 3454 | KEY_FREESAV(&sav); |
| 3455 | return -1; |
| 3456 | #endif |
| 3457 | } |
| 3458 | |
| 3459 | /* |
| 3460 | * Pull out of band byte out of a segment so |
| 3461 | * it doesn't appear in the user's data queue. |
| 3462 | * It is still reflected in the segment length for |
| 3463 | * sequencing purposes. |
| 3464 | */ |
| 3465 | void |
| 3466 | tcp_pulloutofband(struct socket *so, struct tcphdr *th, |
| 3467 | struct mbuf *m, int off) |
| 3468 | { |
| 3469 | int cnt = off + th->th_urp - 1; |
| 3470 | |
| 3471 | while (cnt >= 0) { |
| 3472 | if (m->m_len > cnt) { |
| 3473 | char *cp = mtod(m, char *) + cnt; |
| 3474 | struct tcpcb *tp = sototcpcb(so); |
| 3475 | |
| 3476 | tp->t_iobc = *cp; |
| 3477 | tp->t_oobflags |= TCPOOB_HAVEDATA; |
| 3478 | bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); |
| 3479 | m->m_len--; |
| 3480 | return; |
| 3481 | } |
| 3482 | cnt -= m->m_len; |
| 3483 | m = m->m_next; |
| 3484 | if (m == 0) |
| 3485 | break; |
| 3486 | } |
| 3487 | panic("tcp_pulloutofband" ); |
| 3488 | } |
| 3489 | |
| 3490 | /* |
| 3491 | * Collect new round-trip time estimate |
| 3492 | * and update averages and current timeout. |
| 3493 | * |
| 3494 | * rtt is in units of slow ticks (typically 500 ms) -- essentially the |
| 3495 | * difference of two timestamps. |
| 3496 | */ |
| 3497 | void |
| 3498 | tcp_xmit_timer(struct tcpcb *tp, uint32_t rtt) |
| 3499 | { |
| 3500 | int32_t delta; |
| 3501 | |
| 3502 | TCP_STATINC(TCP_STAT_RTTUPDATED); |
| 3503 | if (tp->t_srtt != 0) { |
| 3504 | /* |
| 3505 | * Compute the amount to add to srtt for smoothing, |
| 3506 | * *alpha, or 2^(-TCP_RTT_SHIFT). Because |
| 3507 | * srtt is stored in 1/32 slow ticks, we conceptually |
| 3508 | * shift left 5 bits, subtract srtt to get the |
| 3509 | * diference, and then shift right by TCP_RTT_SHIFT |
| 3510 | * (3) to obtain 1/8 of the difference. |
| 3511 | */ |
| 3512 | delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); |
| 3513 | /* |
| 3514 | * This can never happen, because delta's lowest |
| 3515 | * possible value is 1/8 of t_srtt. But if it does, |
| 3516 | * set srtt to some reasonable value, here chosen |
| 3517 | * as 1/8 tick. |
| 3518 | */ |
| 3519 | if ((tp->t_srtt += delta) <= 0) |
| 3520 | tp->t_srtt = 1 << 2; |
| 3521 | /* |
| 3522 | * RFC2988 requires that rttvar be updated first. |
| 3523 | * This code is compliant because "delta" is the old |
| 3524 | * srtt minus the new observation (scaled). |
| 3525 | * |
| 3526 | * RFC2988 says: |
| 3527 | * rttvar = (1-beta) * rttvar + beta * |srtt-observed| |
| 3528 | * |
| 3529 | * delta is in units of 1/32 ticks, and has then been |
| 3530 | * divided by 8. This is equivalent to being in 1/16s |
| 3531 | * units and divided by 4. Subtract from it 1/4 of |
| 3532 | * the existing rttvar to form the (signed) amount to |
| 3533 | * adjust. |
| 3534 | */ |
| 3535 | if (delta < 0) |
| 3536 | delta = -delta; |
| 3537 | delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); |
| 3538 | /* |
| 3539 | * As with srtt, this should never happen. There is |
| 3540 | * no support in RFC2988 for this operation. But 1/4s |
| 3541 | * as rttvar when faced with something arguably wrong |
| 3542 | * is ok. |
| 3543 | */ |
| 3544 | if ((tp->t_rttvar += delta) <= 0) |
| 3545 | tp->t_rttvar = 1 << 2; |
| 3546 | |
| 3547 | /* |
| 3548 | * If srtt exceeds .01 second, ensure we use the 'remote' MSL |
| 3549 | * Problem is: it doesn't work. Disabled by defaulting |
| 3550 | * tcp_rttlocal to 0; see corresponding code in |
| 3551 | * tcp_subr that selects local vs remote in a different way. |
| 3552 | * |
| 3553 | * The static branch prediction hint here should be removed |
| 3554 | * when the rtt estimator is fixed and the rtt_enable code |
| 3555 | * is turned back on. |
| 3556 | */ |
| 3557 | if (__predict_false(tcp_rttlocal) && tcp_msl_enable |
| 3558 | && tp->t_srtt > tcp_msl_remote_threshold |
| 3559 | && tp->t_msl < tcp_msl_remote) { |
| 3560 | tp->t_msl = tcp_msl_remote; |
| 3561 | } |
| 3562 | } else { |
| 3563 | /* |
| 3564 | * This is the first measurement. Per RFC2988, 2.2, |
| 3565 | * set rtt=R and srtt=R/2. |
| 3566 | * For srtt, storage representation is 1/32 ticks, |
| 3567 | * so shift left by 5. |
| 3568 | * For rttvar, storage representation is 1/16 ticks, |
| 3569 | * So shift left by 4, but then right by 1 to halve. |
| 3570 | */ |
| 3571 | tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); |
| 3572 | tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); |
| 3573 | } |
| 3574 | tp->t_rtttime = 0; |
| 3575 | tp->t_rxtshift = 0; |
| 3576 | |
| 3577 | /* |
| 3578 | * the retransmit should happen at rtt + 4 * rttvar. |
| 3579 | * Because of the way we do the smoothing, srtt and rttvar |
| 3580 | * will each average +1/2 tick of bias. When we compute |
| 3581 | * the retransmit timer, we want 1/2 tick of rounding and |
| 3582 | * 1 extra tick because of +-1/2 tick uncertainty in the |
| 3583 | * firing of the timer. The bias will give us exactly the |
| 3584 | * 1.5 tick we need. But, because the bias is |
| 3585 | * statistical, we have to test that we don't drop below |
| 3586 | * the minimum feasible timer (which is 2 ticks). |
| 3587 | */ |
| 3588 | TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), |
| 3589 | max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); |
| 3590 | |
| 3591 | /* |
| 3592 | * We received an ack for a packet that wasn't retransmitted; |
| 3593 | * it is probably safe to discard any error indications we've |
| 3594 | * received recently. This isn't quite right, but close enough |
| 3595 | * for now (a route might have failed after we sent a segment, |
| 3596 | * and the return path might not be symmetrical). |
| 3597 | */ |
| 3598 | tp->t_softerror = 0; |
| 3599 | } |
| 3600 | |
| 3601 | |
| 3602 | /* |
| 3603 | * TCP compressed state engine. Currently used to hold compressed |
| 3604 | * state for SYN_RECEIVED. |
| 3605 | */ |
| 3606 | |
/*
 * syn_cache_count: number of entries currently in the syn cache
 * (decremented by syn_cache_rm()).
 * syn_hash1, syn_hash2: secrets mixed into the bucket hash; they are
 * re-drawn by syn_cache_insert() when the cache is empty.
 */
u_long syn_cache_count;
u_int32_t syn_hash1, syn_hash2;

/*
 * IPv4 hash: (address ^ syn_hash1) * (((dp << 16) + sp) ^ syn_hash2).
 * sa points at a struct in_addr; sp and dp are the two port numbers.
 */
#define SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
	((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
/*
 * IPv4-only build: hash the address and port of src together with the
 * port of dst.  src and dst are treated as struct sockaddr_in.
 */
#define SYN_HASHALL(hash, src, dst) \
do { \
	hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
	    ((const struct sockaddr_in *)(src))->sin_port, \
	    ((const struct sockaddr_in *)(dst))->sin_port); \
} while (/*CONSTCOND*/ 0)
#else
/*
 * IPv6 hash: same shape as SYN_HASH, but only the first and last
 * 32-bit words of the address are used and the result is masked to
 * 31 bits.
 */
#define SYN_HASH6(sa, sp, dp) \
	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
	(((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
	& 0x7fffffff)

/*
 * Dual-stack build: select the hash by the address family of src.
 * Any family other than AF_INET or AF_INET6 yields a hash of 0.
 */
#define SYN_HASHALL(hash, src, dst) \
do { \
	switch ((src)->sa_family) { \
	case AF_INET: \
		hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
		    ((const struct sockaddr_in *)(src))->sin_port, \
		    ((const struct sockaddr_in *)(dst))->sin_port); \
		break; \
	case AF_INET6: \
		hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
		    ((const struct sockaddr_in6 *)(src))->sin6_port, \
		    ((const struct sockaddr_in6 *)(dst))->sin6_port); \
		break; \
	default: \
		hash = 0; \
	} \
} while (/*CONSTCOND*/0)
#endif /* INET6 */
| 3644 | |
/* Pool of struct syn_cache objects; set up by syn_cache_init(). */
static struct pool syn_cache_pool;

/*
 * We don't estimate RTT with SYNs, so each packet starts with the default
 * RTT and each timer step has a fixed timeout value.
 *
 * SYN_CACHE_TIMER_ARM sets sc_rxtcur from TCPTV_SRTTDFLT scaled by the
 * tcp_backoff[] entry for the current sc_rxtshift, passing TCPTV_MIN
 * and TCPTV_REXMTMAX to TCPT_RANGESET as the bounds, and then
 * (re)schedules sc_timer to run syn_cache_timer(sc) after
 * sc_rxtcur * (hz / PR_SLOWHZ) callout ticks.
 */
#define SYN_CACHE_TIMER_ARM(sc) \
do { \
	TCPT_RANGESET((sc)->sc_rxtcur, \
	    TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
	    TCPTV_REXMTMAX); \
	callout_reset(&(sc)->sc_timer, \
	    (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc)); \
} while (/*CONSTCOND*/0)

/* Current tcp_now relative to the entry's own time base. */
#define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase)
| 3661 | |
| 3662 | static inline void |
| 3663 | syn_cache_rm(struct syn_cache *sc) |
| 3664 | { |
| 3665 | TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket, |
| 3666 | sc, sc_bucketq); |
| 3667 | sc->sc_tp = NULL; |
| 3668 | LIST_REMOVE(sc, sc_tpq); |
| 3669 | tcp_syn_cache[sc->sc_bucketidx].sch_length--; |
| 3670 | callout_stop(&sc->sc_timer); |
| 3671 | syn_cache_count--; |
| 3672 | } |
| 3673 | |
| 3674 | static inline void |
| 3675 | syn_cache_put(struct syn_cache *sc) |
| 3676 | { |
| 3677 | if (sc->sc_ipopts) |
| 3678 | (void) m_free(sc->sc_ipopts); |
| 3679 | rtcache_free(&sc->sc_route); |
| 3680 | sc->sc_flags |= SCF_DEAD; |
| 3681 | if (!callout_invoking(&sc->sc_timer)) |
| 3682 | callout_schedule(&(sc)->sc_timer, 1); |
| 3683 | } |
| 3684 | |
| 3685 | void |
| 3686 | syn_cache_init(void) |
| 3687 | { |
| 3688 | int i; |
| 3689 | |
| 3690 | pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, |
| 3691 | "synpl" , NULL, IPL_SOFTNET); |
| 3692 | |
| 3693 | /* Initialize the hash buckets. */ |
| 3694 | for (i = 0; i < tcp_syn_cache_size; i++) |
| 3695 | TAILQ_INIT(&tcp_syn_cache[i].sch_bucket); |
| 3696 | } |
| 3697 | |
| 3698 | void |
| 3699 | syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) |
| 3700 | { |
| 3701 | struct syn_cache_head *scp; |
| 3702 | struct syn_cache *sc2; |
| 3703 | int s; |
| 3704 | |
| 3705 | /* |
| 3706 | * If there are no entries in the hash table, reinitialize |
| 3707 | * the hash secrets. |
| 3708 | */ |
| 3709 | if (syn_cache_count == 0) { |
| 3710 | syn_hash1 = cprng_fast32(); |
| 3711 | syn_hash2 = cprng_fast32(); |
| 3712 | } |
| 3713 | |
| 3714 | SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa); |
| 3715 | sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size; |
| 3716 | scp = &tcp_syn_cache[sc->sc_bucketidx]; |
| 3717 | |
| 3718 | /* |
| 3719 | * Make sure that we don't overflow the per-bucket |
| 3720 | * limit or the total cache size limit. |
| 3721 | */ |
| 3722 | s = splsoftnet(); |
| 3723 | if (scp->sch_length >= tcp_syn_bucket_limit) { |
| 3724 | TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW); |
| 3725 | /* |
| 3726 | * The bucket is full. Toss the oldest element in the |
| 3727 | * bucket. This will be the first entry in the bucket. |
| 3728 | */ |
| 3729 | sc2 = TAILQ_FIRST(&scp->sch_bucket); |
| 3730 | #ifdef DIAGNOSTIC |
| 3731 | /* |
| 3732 | * This should never happen; we should always find an |
| 3733 | * entry in our bucket. |
| 3734 | */ |
| 3735 | if (sc2 == NULL) |
| 3736 | panic("syn_cache_insert: bucketoverflow: impossible" ); |
| 3737 | #endif |
| 3738 | syn_cache_rm(sc2); |
| 3739 | syn_cache_put(sc2); /* calls pool_put but see spl above */ |
| 3740 | } else if (syn_cache_count >= tcp_syn_cache_limit) { |
| 3741 | struct syn_cache_head *scp2, *sce; |
| 3742 | |
| 3743 | TCP_STATINC(TCP_STAT_SC_OVERFLOWED); |
| 3744 | /* |
| 3745 | * The cache is full. Toss the oldest entry in the |
| 3746 | * first non-empty bucket we can find. |
| 3747 | * |
| 3748 | * XXX We would really like to toss the oldest |
| 3749 | * entry in the cache, but we hope that this |
| 3750 | * condition doesn't happen very often. |
| 3751 | */ |
| 3752 | scp2 = scp; |
| 3753 | if (TAILQ_EMPTY(&scp2->sch_bucket)) { |
| 3754 | sce = &tcp_syn_cache[tcp_syn_cache_size]; |
| 3755 | for (++scp2; scp2 != scp; scp2++) { |
| 3756 | if (scp2 >= sce) |
| 3757 | scp2 = &tcp_syn_cache[0]; |
| 3758 | if (! TAILQ_EMPTY(&scp2->sch_bucket)) |
| 3759 | break; |
| 3760 | } |
| 3761 | #ifdef DIAGNOSTIC |
| 3762 | /* |
| 3763 | * This should never happen; we should always find a |
| 3764 | * non-empty bucket. |
| 3765 | */ |
| 3766 | if (scp2 == scp) |
| 3767 | panic("syn_cache_insert: cacheoverflow: " |
| 3768 | "impossible" ); |
| 3769 | #endif |
| 3770 | } |
| 3771 | sc2 = TAILQ_FIRST(&scp2->sch_bucket); |
| 3772 | syn_cache_rm(sc2); |
| 3773 | syn_cache_put(sc2); /* calls pool_put but see spl above */ |
| 3774 | } |
| 3775 | |
| 3776 | /* |
| 3777 | * Initialize the entry's timer. |
| 3778 | */ |
| 3779 | sc->sc_rxttot = 0; |
| 3780 | sc->sc_rxtshift = 0; |
| 3781 | SYN_CACHE_TIMER_ARM(sc); |
| 3782 | |
| 3783 | /* Link it from tcpcb entry */ |
| 3784 | LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); |
| 3785 | |
| 3786 | /* Put it into the bucket. */ |
| 3787 | TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); |
| 3788 | scp->sch_length++; |
| 3789 | syn_cache_count++; |
| 3790 | |
| 3791 | TCP_STATINC(TCP_STAT_SC_ADDED); |
| 3792 | splx(s); |
| 3793 | } |
| 3794 | |
| 3795 | /* |
| 3796 | * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. |
| 3797 | * If we have retransmitted an entry the maximum number of times, expire |
| 3798 | * that entry. |
| 3799 | */ |
| 3800 | void |
| 3801 | syn_cache_timer(void *arg) |
| 3802 | { |
| 3803 | struct syn_cache *sc = arg; |
| 3804 | |
| 3805 | mutex_enter(softnet_lock); |
| 3806 | KERNEL_LOCK(1, NULL); |
| 3807 | callout_ack(&sc->sc_timer); |
| 3808 | |
| 3809 | if (__predict_false(sc->sc_flags & SCF_DEAD)) { |
| 3810 | TCP_STATINC(TCP_STAT_SC_DELAYED_FREE); |
| 3811 | callout_destroy(&sc->sc_timer); |
| 3812 | pool_put(&syn_cache_pool, sc); |
| 3813 | KERNEL_UNLOCK_ONE(NULL); |
| 3814 | mutex_exit(softnet_lock); |
| 3815 | return; |
| 3816 | } |
| 3817 | |
| 3818 | if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { |
| 3819 | /* Drop it -- too many retransmissions. */ |
| 3820 | goto dropit; |
| 3821 | } |
| 3822 | |
| 3823 | /* |
| 3824 | * Compute the total amount of time this entry has |
| 3825 | * been on a queue. If this entry has been on longer |
| 3826 | * than the keep alive timer would allow, expire it. |
| 3827 | */ |
| 3828 | sc->sc_rxttot += sc->sc_rxtcur; |
| 3829 | if (sc->sc_rxttot >= tcp_keepinit) |
| 3830 | goto dropit; |
| 3831 | |
| 3832 | TCP_STATINC(TCP_STAT_SC_RETRANSMITTED); |
| 3833 | (void) syn_cache_respond(sc, NULL); |
| 3834 | |
| 3835 | /* Advance the timer back-off. */ |
| 3836 | sc->sc_rxtshift++; |
| 3837 | SYN_CACHE_TIMER_ARM(sc); |
| 3838 | |
| 3839 | KERNEL_UNLOCK_ONE(NULL); |
| 3840 | mutex_exit(softnet_lock); |
| 3841 | return; |
| 3842 | |
| 3843 | dropit: |
| 3844 | TCP_STATINC(TCP_STAT_SC_TIMED_OUT); |
| 3845 | syn_cache_rm(sc); |
| 3846 | if (sc->sc_ipopts) |
| 3847 | (void) m_free(sc->sc_ipopts); |
| 3848 | rtcache_free(&sc->sc_route); |
| 3849 | callout_destroy(&sc->sc_timer); |
| 3850 | pool_put(&syn_cache_pool, sc); |
| 3851 | KERNEL_UNLOCK_ONE(NULL); |
| 3852 | mutex_exit(softnet_lock); |
| 3853 | } |
| 3854 | |
| 3855 | /* |
| 3856 | * Remove syn cache created by the specified tcb entry, |
| 3857 | * because this does not make sense to keep them |
| 3858 | * (if there's no tcb entry, syn cache entry will never be used) |
| 3859 | */ |
| 3860 | void |
| 3861 | syn_cache_cleanup(struct tcpcb *tp) |
| 3862 | { |
| 3863 | struct syn_cache *sc, *nsc; |
| 3864 | int s; |
| 3865 | |
| 3866 | s = splsoftnet(); |
| 3867 | |
| 3868 | for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) { |
| 3869 | nsc = LIST_NEXT(sc, sc_tpq); |
| 3870 | |
| 3871 | #ifdef DIAGNOSTIC |
| 3872 | if (sc->sc_tp != tp) |
| 3873 | panic("invalid sc_tp in syn_cache_cleanup" ); |
| 3874 | #endif |
| 3875 | syn_cache_rm(sc); |
| 3876 | syn_cache_put(sc); /* calls pool_put but see spl above */ |
| 3877 | } |
| 3878 | /* just for safety */ |
| 3879 | LIST_INIT(&tp->t_sc); |
| 3880 | |
| 3881 | splx(s); |
| 3882 | } |
| 3883 | |
| 3884 | /* |
| 3885 | * Find an entry in the syn cache. |
| 3886 | */ |
| 3887 | struct syn_cache * |
| 3888 | syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst, |
| 3889 | struct syn_cache_head **headp) |
| 3890 | { |
| 3891 | struct syn_cache *sc; |
| 3892 | struct syn_cache_head *scp; |
| 3893 | u_int32_t hash; |
| 3894 | int s; |
| 3895 | |
| 3896 | SYN_HASHALL(hash, src, dst); |
| 3897 | |
| 3898 | scp = &tcp_syn_cache[hash % tcp_syn_cache_size]; |
| 3899 | *headp = scp; |
| 3900 | s = splsoftnet(); |
| 3901 | for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL; |
| 3902 | sc = TAILQ_NEXT(sc, sc_bucketq)) { |
| 3903 | if (sc->sc_hash != hash) |
| 3904 | continue; |
| 3905 | if (!memcmp(&sc->sc_src, src, src->sa_len) && |
| 3906 | !memcmp(&sc->sc_dst, dst, dst->sa_len)) { |
| 3907 | splx(s); |
| 3908 | return (sc); |
| 3909 | } |
| 3910 | } |
| 3911 | splx(s); |
| 3912 | return (NULL); |
| 3913 | } |
| 3914 | |
| 3915 | /* |
| 3916 | * This function gets called when we receive an ACK for a |
| 3917 | * socket in the LISTEN state. We look up the connection |
| 3918 | * in the syn cache, and if its there, we pull it out of |
| 3919 | * the cache and turn it into a full-blown connection in |
| 3920 | * the SYN-RECEIVED state. |
| 3921 | * |
| 3922 | * The return values may not be immediately obvious, and their effects |
| 3923 | * can be subtle, so here they are: |
| 3924 | * |
| 3925 | * NULL SYN was not found in cache; caller should drop the |
| 3926 | * packet and send an RST. |
| 3927 | * |
| 3928 | * -1 We were unable to create the new connection, and are |
| 3929 | * aborting it. An ACK,RST is being sent to the peer |
| 3930 | * (unless we got screwey sequence numbners; see below), |
| 3931 | * because the 3-way handshake has been completed. Caller |
| 3932 | * should not free the mbuf, since we may be using it. If |
| 3933 | * we are not, we will free it. |
| 3934 | * |
| 3935 | * Otherwise, the return value is a pointer to the new socket |
| 3936 | * associated with the connection. |
| 3937 | */ |
| 3938 | struct socket * |
| 3939 | syn_cache_get(struct sockaddr *src, struct sockaddr *dst, |
| 3940 | struct tcphdr *th, unsigned int hlen, unsigned int tlen, |
| 3941 | struct socket *so, struct mbuf *m) |
| 3942 | { |
| 3943 | struct syn_cache *sc; |
| 3944 | struct syn_cache_head *scp; |
| 3945 | struct inpcb *inp = NULL; |
| 3946 | #ifdef INET6 |
| 3947 | struct in6pcb *in6p = NULL; |
| 3948 | #endif |
| 3949 | struct tcpcb *tp = 0; |
| 3950 | int s; |
| 3951 | struct socket *oso; |
| 3952 | |
| 3953 | s = splsoftnet(); |
| 3954 | if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { |
| 3955 | splx(s); |
| 3956 | return (NULL); |
| 3957 | } |
| 3958 | |
| 3959 | /* |
| 3960 | * Verify the sequence and ack numbers. Try getting the correct |
| 3961 | * response again. |
| 3962 | */ |
| 3963 | if ((th->th_ack != sc->sc_iss + 1) || |
| 3964 | SEQ_LEQ(th->th_seq, sc->sc_irs) || |
| 3965 | SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { |
| 3966 | (void) syn_cache_respond(sc, m); |
| 3967 | splx(s); |
| 3968 | return ((struct socket *)(-1)); |
| 3969 | } |
| 3970 | |
| 3971 | /* Remove this cache entry */ |
| 3972 | syn_cache_rm(sc); |
| 3973 | splx(s); |
| 3974 | |
| 3975 | /* |
| 3976 | * Ok, create the full blown connection, and set things up |
| 3977 | * as they would have been set up if we had created the |
| 3978 | * connection when the SYN arrived. If we can't create |
| 3979 | * the connection, abort it. |
| 3980 | */ |
| 3981 | /* |
| 3982 | * inp still has the OLD in_pcb stuff, set the |
| 3983 | * v6-related flags on the new guy, too. This is |
| 3984 | * done particularly for the case where an AF_INET6 |
| 3985 | * socket is bound only to a port, and a v4 connection |
| 3986 | * comes in on that port. |
| 3987 | * we also copy the flowinfo from the original pcb |
| 3988 | * to the new one. |
| 3989 | */ |
| 3990 | oso = so; |
| 3991 | so = sonewconn(so, true); |
| 3992 | if (so == NULL) |
| 3993 | goto resetandabort; |
| 3994 | |
| 3995 | switch (so->so_proto->pr_domain->dom_family) { |
| 3996 | #ifdef INET |
| 3997 | case AF_INET: |
| 3998 | inp = sotoinpcb(so); |
| 3999 | break; |
| 4000 | #endif |
| 4001 | #ifdef INET6 |
| 4002 | case AF_INET6: |
| 4003 | in6p = sotoin6pcb(so); |
| 4004 | break; |
| 4005 | #endif |
| 4006 | } |
| 4007 | switch (src->sa_family) { |
| 4008 | #ifdef INET |
| 4009 | case AF_INET: |
| 4010 | if (inp) { |
| 4011 | inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr; |
| 4012 | inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port; |
| 4013 | inp->inp_options = ip_srcroute(); |
| 4014 | in_pcbstate(inp, INP_BOUND); |
| 4015 | if (inp->inp_options == NULL) { |
| 4016 | inp->inp_options = sc->sc_ipopts; |
| 4017 | sc->sc_ipopts = NULL; |
| 4018 | } |
| 4019 | } |
| 4020 | #ifdef INET6 |
| 4021 | else if (in6p) { |
| 4022 | /* IPv4 packet to AF_INET6 socket */ |
| 4023 | memset(&in6p->in6p_laddr, 0, sizeof(in6p->in6p_laddr)); |
| 4024 | in6p->in6p_laddr.s6_addr16[5] = htons(0xffff); |
| 4025 | bcopy(&((struct sockaddr_in *)dst)->sin_addr, |
| 4026 | &in6p->in6p_laddr.s6_addr32[3], |
| 4027 | sizeof(((struct sockaddr_in *)dst)->sin_addr)); |
| 4028 | in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port; |
| 4029 | in6totcpcb(in6p)->t_family = AF_INET; |
| 4030 | if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY) |
| 4031 | in6p->in6p_flags |= IN6P_IPV6_V6ONLY; |
| 4032 | else |
| 4033 | in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY; |
| 4034 | in6_pcbstate(in6p, IN6P_BOUND); |
| 4035 | } |
| 4036 | #endif |
| 4037 | break; |
| 4038 | #endif |
| 4039 | #ifdef INET6 |
| 4040 | case AF_INET6: |
| 4041 | if (in6p) { |
| 4042 | in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr; |
| 4043 | in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port; |
| 4044 | in6_pcbstate(in6p, IN6P_BOUND); |
| 4045 | } |
| 4046 | break; |
| 4047 | #endif |
| 4048 | } |
| 4049 | #ifdef INET6 |
| 4050 | if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) { |
| 4051 | struct in6pcb *oin6p = sotoin6pcb(oso); |
| 4052 | /* inherit socket options from the listening socket */ |
| 4053 | in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS); |
| 4054 | if (in6p->in6p_flags & IN6P_CONTROLOPTS) { |
| 4055 | m_freem(in6p->in6p_options); |
| 4056 | in6p->in6p_options = 0; |
| 4057 | } |
| 4058 | ip6_savecontrol(in6p, &in6p->in6p_options, |
| 4059 | mtod(m, struct ip6_hdr *), m); |
| 4060 | } |
| 4061 | #endif |
| 4062 | |
| 4063 | #if defined(IPSEC) |
| 4064 | if (ipsec_used) { |
| 4065 | /* |
| 4066 | * we make a copy of policy, instead of sharing the policy, for |
| 4067 | * better behavior in terms of SA lookup and dead SA removal. |
| 4068 | */ |
| 4069 | if (inp) { |
| 4070 | /* copy old policy into new socket's */ |
| 4071 | if (ipsec_copy_pcbpolicy(sotoinpcb(oso)->inp_sp, |
| 4072 | inp->inp_sp)) |
| 4073 | printf("tcp_input: could not copy policy\n" ); |
| 4074 | } |
| 4075 | #ifdef INET6 |
| 4076 | else if (in6p) { |
| 4077 | /* copy old policy into new socket's */ |
| 4078 | if (ipsec_copy_pcbpolicy(sotoin6pcb(oso)->in6p_sp, |
| 4079 | in6p->in6p_sp)) |
| 4080 | printf("tcp_input: could not copy policy\n" ); |
| 4081 | } |
| 4082 | #endif |
| 4083 | } |
| 4084 | #endif |
| 4085 | |
| 4086 | /* |
| 4087 | * Give the new socket our cached route reference. |
| 4088 | */ |
| 4089 | if (inp) { |
| 4090 | rtcache_copy(&inp->inp_route, &sc->sc_route); |
| 4091 | rtcache_free(&sc->sc_route); |
| 4092 | } |
| 4093 | #ifdef INET6 |
| 4094 | else { |
| 4095 | rtcache_copy(&in6p->in6p_route, &sc->sc_route); |
| 4096 | rtcache_free(&sc->sc_route); |
| 4097 | } |
| 4098 | #endif |
| 4099 | |
| 4100 | if (inp) { |
| 4101 | struct sockaddr_in sin; |
| 4102 | memcpy(&sin, src, src->sa_len); |
| 4103 | if (in_pcbconnect(inp, &sin, &lwp0)) { |
| 4104 | goto resetandabort; |
| 4105 | } |
| 4106 | } |
| 4107 | #ifdef INET6 |
| 4108 | else if (in6p) { |
| 4109 | struct sockaddr_in6 sin6; |
| 4110 | memcpy(&sin6, src, src->sa_len); |
| 4111 | if (src->sa_family == AF_INET) { |
| 4112 | /* IPv4 packet to AF_INET6 socket */ |
| 4113 | in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6); |
| 4114 | } |
| 4115 | if (in6_pcbconnect(in6p, &sin6, NULL)) { |
| 4116 | goto resetandabort; |
| 4117 | } |
| 4118 | } |
| 4119 | #endif |
| 4120 | else { |
| 4121 | goto resetandabort; |
| 4122 | } |
| 4123 | |
| 4124 | if (inp) |
| 4125 | tp = intotcpcb(inp); |
| 4126 | #ifdef INET6 |
| 4127 | else if (in6p) |
| 4128 | tp = in6totcpcb(in6p); |
| 4129 | #endif |
| 4130 | else |
| 4131 | tp = NULL; |
| 4132 | tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY; |
| 4133 | if (sc->sc_request_r_scale != 15) { |
| 4134 | tp->requested_s_scale = sc->sc_requested_s_scale; |
| 4135 | tp->request_r_scale = sc->sc_request_r_scale; |
| 4136 | tp->snd_scale = sc->sc_requested_s_scale; |
| 4137 | tp->rcv_scale = sc->sc_request_r_scale; |
| 4138 | tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; |
| 4139 | } |
| 4140 | if (sc->sc_flags & SCF_TIMESTAMP) |
| 4141 | tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; |
| 4142 | tp->ts_timebase = sc->sc_timebase; |
| 4143 | |
| 4144 | tp->t_template = tcp_template(tp); |
| 4145 | if (tp->t_template == 0) { |
| 4146 | tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ |
| 4147 | so = NULL; |
| 4148 | m_freem(m); |
| 4149 | goto abort; |
| 4150 | } |
| 4151 | |
| 4152 | tp->iss = sc->sc_iss; |
| 4153 | tp->irs = sc->sc_irs; |
| 4154 | tcp_sendseqinit(tp); |
| 4155 | tcp_rcvseqinit(tp); |
| 4156 | tp->t_state = TCPS_SYN_RECEIVED; |
| 4157 | TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit); |
| 4158 | TCP_STATINC(TCP_STAT_ACCEPTS); |
| 4159 | |
| 4160 | if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack) |
| 4161 | tp->t_flags |= TF_WILL_SACK; |
| 4162 | |
| 4163 | if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn) |
| 4164 | tp->t_flags |= TF_ECN_PERMIT; |
| 4165 | |
| 4166 | #ifdef TCP_SIGNATURE |
| 4167 | if (sc->sc_flags & SCF_SIGNATURE) |
| 4168 | tp->t_flags |= TF_SIGNATURE; |
| 4169 | #endif |
| 4170 | |
| 4171 | /* Initialize tp->t_ourmss before we deal with the peer's! */ |
| 4172 | tp->t_ourmss = sc->sc_ourmaxseg; |
| 4173 | tcp_mss_from_peer(tp, sc->sc_peermaxseg); |
| 4174 | |
| 4175 | /* |
| 4176 | * Initialize the initial congestion window. If we |
| 4177 | * had to retransmit the SYN,ACK, we must initialize cwnd |
| 4178 | * to 1 segment (i.e. the Loss Window). |
| 4179 | */ |
| 4180 | if (sc->sc_rxtshift) |
| 4181 | tp->snd_cwnd = tp->t_peermss; |
| 4182 | else { |
| 4183 | int ss = tcp_init_win; |
| 4184 | #ifdef INET |
| 4185 | if (inp != NULL && in_localaddr(inp->inp_faddr)) |
| 4186 | ss = tcp_init_win_local; |
| 4187 | #endif |
| 4188 | #ifdef INET6 |
| 4189 | if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr)) |
| 4190 | ss = tcp_init_win_local; |
| 4191 | #endif |
| 4192 | tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss); |
| 4193 | } |
| 4194 | |
| 4195 | tcp_rmx_rtt(tp); |
| 4196 | tp->snd_wl1 = sc->sc_irs; |
| 4197 | tp->rcv_up = sc->sc_irs + 1; |
| 4198 | |
| 4199 | /* |
| 4200 | * This is what whould have happened in tcp_output() when |
| 4201 | * the SYN,ACK was sent. |
| 4202 | */ |
| 4203 | tp->snd_up = tp->snd_una; |
| 4204 | tp->snd_max = tp->snd_nxt = tp->iss+1; |
| 4205 | TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); |
| 4206 | if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) |
| 4207 | tp->rcv_adv = tp->rcv_nxt + sc->sc_win; |
| 4208 | tp->last_ack_sent = tp->rcv_nxt; |
| 4209 | tp->t_partialacks = -1; |
| 4210 | tp->t_dupacks = 0; |
| 4211 | |
| 4212 | TCP_STATINC(TCP_STAT_SC_COMPLETED); |
| 4213 | s = splsoftnet(); |
| 4214 | syn_cache_put(sc); |
| 4215 | splx(s); |
| 4216 | return (so); |
| 4217 | |
| 4218 | resetandabort: |
| 4219 | (void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); |
| 4220 | abort: |
| 4221 | if (so != NULL) { |
| 4222 | (void) soqremque(so, 1); |
| 4223 | (void) soabort(so); |
| 4224 | mutex_enter(softnet_lock); |
| 4225 | } |
| 4226 | s = splsoftnet(); |
| 4227 | syn_cache_put(sc); |
| 4228 | splx(s); |
| 4229 | TCP_STATINC(TCP_STAT_SC_ABORTED); |
| 4230 | return ((struct socket *)(-1)); |
| 4231 | } |
| 4232 | |
| 4233 | /* |
| 4234 | * This function is called when we get a RST for a |
| 4235 | * non-existent connection, so that we can see if the |
| 4236 | * connection is in the syn cache. If it is, zap it. |
| 4237 | */ |
| 4238 | |
| 4239 | void |
| 4240 | syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th) |
| 4241 | { |
| 4242 | struct syn_cache *sc; |
| 4243 | struct syn_cache_head *scp; |
| 4244 | int s = splsoftnet(); |
| 4245 | |
| 4246 | if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { |
| 4247 | splx(s); |
| 4248 | return; |
| 4249 | } |
| 4250 | if (SEQ_LT(th->th_seq, sc->sc_irs) || |
| 4251 | SEQ_GT(th->th_seq, sc->sc_irs+1)) { |
| 4252 | splx(s); |
| 4253 | return; |
| 4254 | } |
| 4255 | syn_cache_rm(sc); |
| 4256 | TCP_STATINC(TCP_STAT_SC_RESET); |
| 4257 | syn_cache_put(sc); /* calls pool_put but see spl above */ |
| 4258 | splx(s); |
| 4259 | } |
| 4260 | |
| 4261 | void |
| 4262 | syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst, |
| 4263 | struct tcphdr *th) |
| 4264 | { |
| 4265 | struct syn_cache *sc; |
| 4266 | struct syn_cache_head *scp; |
| 4267 | int s; |
| 4268 | |
| 4269 | s = splsoftnet(); |
| 4270 | if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) { |
| 4271 | splx(s); |
| 4272 | return; |
| 4273 | } |
| 4274 | /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ |
| 4275 | if (ntohl (th->th_seq) != sc->sc_iss) { |
| 4276 | splx(s); |
| 4277 | return; |
| 4278 | } |
| 4279 | |
| 4280 | /* |
| 4281 | * If we've retransmitted 3 times and this is our second error, |
| 4282 | * we remove the entry. Otherwise, we allow it to continue on. |
| 4283 | * This prevents us from incorrectly nuking an entry during a |
| 4284 | * spurious network outage. |
| 4285 | * |
| 4286 | * See tcp_notify(). |
| 4287 | */ |
| 4288 | if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) { |
| 4289 | sc->sc_flags |= SCF_UNREACH; |
| 4290 | splx(s); |
| 4291 | return; |
| 4292 | } |
| 4293 | |
| 4294 | syn_cache_rm(sc); |
| 4295 | TCP_STATINC(TCP_STAT_SC_UNREACH); |
| 4296 | syn_cache_put(sc); /* calls pool_put but see spl above */ |
| 4297 | splx(s); |
| 4298 | } |
| 4299 | |
| 4300 | /* |
| 4301 | * Given a LISTEN socket and an inbound SYN request, add |
| 4302 | * this to the syn cache, and send back a segment: |
| 4303 | * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> |
| 4304 | * to the source. |
| 4305 | * |
| 4306 | * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. |
| 4307 | * Doing so would require that we hold onto the data and deliver it |
| 4308 | * to the application. However, if we are the target of a SYN-flood |
| 4309 | * DoS attack, an attacker could send data which would eventually |
| 4310 | * consume all available buffer space if it were ACKed. By not ACKing |
| 4311 | * the data, we avoid this DoS scenario. |
| 4312 | */ |
| 4313 | |
| 4314 | int |
| 4315 | syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, |
| 4316 | unsigned int hlen, struct socket *so, struct mbuf *m, u_char *optp, |
| 4317 | int optlen, struct tcp_opt_info *oi) |
| 4318 | { |
| 4319 | struct tcpcb tb, *tp; |
| 4320 | long win; |
| 4321 | struct syn_cache *sc; |
| 4322 | struct syn_cache_head *scp; |
| 4323 | struct mbuf *ipopts; |
| 4324 | struct tcp_opt_info opti; |
| 4325 | int s; |
| 4326 | |
| 4327 | tp = sototcpcb(so); |
| 4328 | |
| 4329 | memset(&opti, 0, sizeof(opti)); |
| 4330 | |
| 4331 | /* |
| 4332 | * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN |
| 4333 | * |
| 4334 | * Note this check is performed in tcp_input() very early on. |
| 4335 | */ |
| 4336 | |
| 4337 | /* |
| 4338 | * Initialize some local state. |
| 4339 | */ |
| 4340 | win = sbspace(&so->so_rcv); |
| 4341 | if (win > TCP_MAXWIN) |
| 4342 | win = TCP_MAXWIN; |
| 4343 | |
| 4344 | switch (src->sa_family) { |
| 4345 | #ifdef INET |
| 4346 | case AF_INET: |
| 4347 | /* |
| 4348 | * Remember the IP options, if any. |
| 4349 | */ |
| 4350 | ipopts = ip_srcroute(); |
| 4351 | break; |
| 4352 | #endif |
| 4353 | default: |
| 4354 | ipopts = NULL; |
| 4355 | } |
| 4356 | |
| 4357 | #ifdef TCP_SIGNATURE |
| 4358 | if (optp || (tp->t_flags & TF_SIGNATURE)) |
| 4359 | #else |
| 4360 | if (optp) |
| 4361 | #endif |
| 4362 | { |
| 4363 | tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; |
| 4364 | #ifdef TCP_SIGNATURE |
| 4365 | tb.t_flags |= (tp->t_flags & TF_SIGNATURE); |
| 4366 | #endif |
| 4367 | tb.t_state = TCPS_LISTEN; |
| 4368 | if (tcp_dooptions(&tb, optp, optlen, th, m, m->m_pkthdr.len - |
| 4369 | sizeof(struct tcphdr) - optlen - hlen, oi) < 0) |
| 4370 | return (0); |
| 4371 | } else |
| 4372 | tb.t_flags = 0; |
| 4373 | |
| 4374 | /* |
| 4375 | * See if we already have an entry for this connection. |
| 4376 | * If we do, resend the SYN,ACK. We do not count this |
| 4377 | * as a retransmission (XXX though maybe we should). |
| 4378 | */ |
| 4379 | if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) { |
| 4380 | TCP_STATINC(TCP_STAT_SC_DUPESYN); |
| 4381 | if (ipopts) { |
| 4382 | /* |
| 4383 | * If we were remembering a previous source route, |
| 4384 | * forget it and use the new one we've been given. |
| 4385 | */ |
| 4386 | if (sc->sc_ipopts) |
| 4387 | (void) m_free(sc->sc_ipopts); |
| 4388 | sc->sc_ipopts = ipopts; |
| 4389 | } |
| 4390 | sc->sc_timestamp = tb.ts_recent; |
| 4391 | if (syn_cache_respond(sc, m) == 0) { |
| 4392 | uint64_t *tcps = TCP_STAT_GETREF(); |
| 4393 | tcps[TCP_STAT_SNDACKS]++; |
| 4394 | tcps[TCP_STAT_SNDTOTAL]++; |
| 4395 | TCP_STAT_PUTREF(); |
| 4396 | } |
| 4397 | return (1); |
| 4398 | } |
| 4399 | |
| 4400 | s = splsoftnet(); |
| 4401 | sc = pool_get(&syn_cache_pool, PR_NOWAIT); |
| 4402 | splx(s); |
| 4403 | if (sc == NULL) { |
| 4404 | if (ipopts) |
| 4405 | (void) m_free(ipopts); |
| 4406 | return (0); |
| 4407 | } |
| 4408 | |
| 4409 | /* |
| 4410 | * Fill in the cache, and put the necessary IP and TCP |
| 4411 | * options into the reply. |
| 4412 | */ |
| 4413 | memset(sc, 0, sizeof(struct syn_cache)); |
| 4414 | callout_init(&sc->sc_timer, CALLOUT_MPSAFE); |
| 4415 | bcopy(src, &sc->sc_src, src->sa_len); |
| 4416 | bcopy(dst, &sc->sc_dst, dst->sa_len); |
| 4417 | sc->sc_flags = 0; |
| 4418 | sc->sc_ipopts = ipopts; |
| 4419 | sc->sc_irs = th->th_seq; |
| 4420 | switch (src->sa_family) { |
| 4421 | #ifdef INET |
| 4422 | case AF_INET: |
| 4423 | { |
| 4424 | struct sockaddr_in *srcin = (void *) src; |
| 4425 | struct sockaddr_in *dstin = (void *) dst; |
| 4426 | |
| 4427 | sc->sc_iss = tcp_new_iss1(&dstin->sin_addr, |
| 4428 | &srcin->sin_addr, dstin->sin_port, |
| 4429 | srcin->sin_port, sizeof(dstin->sin_addr), 0); |
| 4430 | break; |
| 4431 | } |
| 4432 | #endif /* INET */ |
| 4433 | #ifdef INET6 |
| 4434 | case AF_INET6: |
| 4435 | { |
| 4436 | struct sockaddr_in6 *srcin6 = (void *) src; |
| 4437 | struct sockaddr_in6 *dstin6 = (void *) dst; |
| 4438 | |
| 4439 | sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr, |
| 4440 | &srcin6->sin6_addr, dstin6->sin6_port, |
| 4441 | srcin6->sin6_port, sizeof(dstin6->sin6_addr), 0); |
| 4442 | break; |
| 4443 | } |
| 4444 | #endif /* INET6 */ |
| 4445 | } |
| 4446 | sc->sc_peermaxseg = oi->maxseg; |
| 4447 | sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ? |
| 4448 | m_get_rcvif_NOMPSAFE(m) : NULL, |
| 4449 | sc->sc_src.sa.sa_family); |
| 4450 | sc->sc_win = win; |
| 4451 | sc->sc_timebase = tcp_now - 1; /* see tcp_newtcpcb() */ |
| 4452 | sc->sc_timestamp = tb.ts_recent; |
| 4453 | if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == |
| 4454 | (TF_REQ_TSTMP|TF_RCVD_TSTMP)) |
| 4455 | sc->sc_flags |= SCF_TIMESTAMP; |
| 4456 | if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == |
| 4457 | (TF_RCVD_SCALE|TF_REQ_SCALE)) { |
| 4458 | sc->sc_requested_s_scale = tb.requested_s_scale; |
| 4459 | sc->sc_request_r_scale = 0; |
| 4460 | /* |
| 4461 | * Pick the smallest possible scaling factor that |
| 4462 | * will still allow us to scale up to sb_max. |
| 4463 | * |
| 4464 | * We do this because there are broken firewalls that |
| 4465 | * will corrupt the window scale option, leading to |
| 4466 | * the other endpoint believing that our advertised |
| 4467 | * window is unscaled. At scale factors larger than |
| 4468 | * 5 the unscaled window will drop below 1500 bytes, |
| 4469 | * leading to serious problems when traversing these |
| 4470 | * broken firewalls. |
| 4471 | * |
| 4472 | * With the default sbmax of 256K, a scale factor |
| 4473 | * of 3 will be chosen by this algorithm. Those who |
| 4474 | * choose a larger sbmax should watch out |
| 4475 | * for the compatiblity problems mentioned above. |
| 4476 | * |
| 4477 | * RFC1323: The Window field in a SYN (i.e., a <SYN> |
| 4478 | * or <SYN,ACK>) segment itself is never scaled. |
| 4479 | */ |
| 4480 | while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && |
| 4481 | (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) |
| 4482 | sc->sc_request_r_scale++; |
| 4483 | } else { |
| 4484 | sc->sc_requested_s_scale = 15; |
| 4485 | sc->sc_request_r_scale = 15; |
| 4486 | } |
| 4487 | if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack) |
| 4488 | sc->sc_flags |= SCF_SACK_PERMIT; |
| 4489 | |
| 4490 | /* |
| 4491 | * ECN setup packet recieved. |
| 4492 | */ |
| 4493 | if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn) |
| 4494 | sc->sc_flags |= SCF_ECN_PERMIT; |
| 4495 | |
| 4496 | #ifdef TCP_SIGNATURE |
| 4497 | if (tb.t_flags & TF_SIGNATURE) |
| 4498 | sc->sc_flags |= SCF_SIGNATURE; |
| 4499 | #endif |
| 4500 | sc->sc_tp = tp; |
| 4501 | if (syn_cache_respond(sc, m) == 0) { |
| 4502 | uint64_t *tcps = TCP_STAT_GETREF(); |
| 4503 | tcps[TCP_STAT_SNDACKS]++; |
| 4504 | tcps[TCP_STAT_SNDTOTAL]++; |
| 4505 | TCP_STAT_PUTREF(); |
| 4506 | syn_cache_insert(sc, tp); |
| 4507 | } else { |
| 4508 | s = splsoftnet(); |
| 4509 | /* |
| 4510 | * syn_cache_put() will try to schedule the timer, so |
| 4511 | * we need to initialize it |
| 4512 | */ |
| 4513 | SYN_CACHE_TIMER_ARM(sc); |
| 4514 | syn_cache_put(sc); |
| 4515 | splx(s); |
| 4516 | TCP_STATINC(TCP_STAT_SC_DROPPED); |
| 4517 | } |
| 4518 | return (1); |
| 4519 | } |
| 4520 | |
| 4521 | /* |
| 4522 | * syn_cache_respond: (re)send SYN+ACK. |
| 4523 | * |
| 4524 | * returns 0 on success. otherwise returns an errno, typically ENOBUFS. |
| 4525 | */ |
| 4526 | |
| 4527 | int |
| 4528 | syn_cache_respond(struct syn_cache *sc, struct mbuf *m) |
| 4529 | { |
| 4530 | #ifdef INET6 |
| 4531 | struct rtentry *rt; |
| 4532 | #endif |
| 4533 | struct route *ro; |
| 4534 | u_int8_t *optp; |
| 4535 | int optlen, error; |
| 4536 | u_int16_t tlen; |
| 4537 | struct ip *ip = NULL; |
| 4538 | #ifdef INET6 |
| 4539 | struct ip6_hdr *ip6 = NULL; |
| 4540 | #endif |
| 4541 | struct tcpcb *tp = NULL; |
| 4542 | struct tcphdr *th; |
| 4543 | u_int hlen; |
| 4544 | struct socket *so; |
| 4545 | |
| 4546 | ro = &sc->sc_route; |
| 4547 | switch (sc->sc_src.sa.sa_family) { |
| 4548 | case AF_INET: |
| 4549 | hlen = sizeof(struct ip); |
| 4550 | break; |
| 4551 | #ifdef INET6 |
| 4552 | case AF_INET6: |
| 4553 | hlen = sizeof(struct ip6_hdr); |
| 4554 | break; |
| 4555 | #endif |
| 4556 | default: |
| 4557 | if (m) |
| 4558 | m_freem(m); |
| 4559 | return (EAFNOSUPPORT); |
| 4560 | } |
| 4561 | |
| 4562 | /* Compute the size of the TCP options. */ |
| 4563 | optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + |
| 4564 | ((sc->sc_flags & SCF_SACK_PERMIT) ? (TCPOLEN_SACK_PERMITTED + 2) : 0) + |
| 4565 | #ifdef TCP_SIGNATURE |
| 4566 | ((sc->sc_flags & SCF_SIGNATURE) ? (TCPOLEN_SIGNATURE + 2) : 0) + |
| 4567 | #endif |
| 4568 | ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); |
| 4569 | |
| 4570 | tlen = hlen + sizeof(struct tcphdr) + optlen; |
| 4571 | |
| 4572 | /* |
| 4573 | * Create the IP+TCP header from scratch. |
| 4574 | */ |
| 4575 | if (m) |
| 4576 | m_freem(m); |
| 4577 | #ifdef DIAGNOSTIC |
| 4578 | if (max_linkhdr + tlen > MCLBYTES) |
| 4579 | return (ENOBUFS); |
| 4580 | #endif |
| 4581 | MGETHDR(m, M_DONTWAIT, MT_DATA); |
| 4582 | if (m && (max_linkhdr + tlen) > MHLEN) { |
| 4583 | MCLGET(m, M_DONTWAIT); |
| 4584 | if ((m->m_flags & M_EXT) == 0) { |
| 4585 | m_freem(m); |
| 4586 | m = NULL; |
| 4587 | } |
| 4588 | } |
| 4589 | if (m == NULL) |
| 4590 | return (ENOBUFS); |
| 4591 | MCLAIM(m, &tcp_tx_mowner); |
| 4592 | |
| 4593 | /* Fixup the mbuf. */ |
| 4594 | m->m_data += max_linkhdr; |
| 4595 | m->m_len = m->m_pkthdr.len = tlen; |
| 4596 | if (sc->sc_tp) { |
| 4597 | tp = sc->sc_tp; |
| 4598 | if (tp->t_inpcb) |
| 4599 | so = tp->t_inpcb->inp_socket; |
| 4600 | #ifdef INET6 |
| 4601 | else if (tp->t_in6pcb) |
| 4602 | so = tp->t_in6pcb->in6p_socket; |
| 4603 | #endif |
| 4604 | else |
| 4605 | so = NULL; |
| 4606 | } else |
| 4607 | so = NULL; |
| 4608 | m_reset_rcvif(m); |
| 4609 | memset(mtod(m, u_char *), 0, tlen); |
| 4610 | |
| 4611 | switch (sc->sc_src.sa.sa_family) { |
| 4612 | case AF_INET: |
| 4613 | ip = mtod(m, struct ip *); |
| 4614 | ip->ip_v = 4; |
| 4615 | ip->ip_dst = sc->sc_src.sin.sin_addr; |
| 4616 | ip->ip_src = sc->sc_dst.sin.sin_addr; |
| 4617 | ip->ip_p = IPPROTO_TCP; |
| 4618 | th = (struct tcphdr *)(ip + 1); |
| 4619 | th->th_dport = sc->sc_src.sin.sin_port; |
| 4620 | th->th_sport = sc->sc_dst.sin.sin_port; |
| 4621 | break; |
| 4622 | #ifdef INET6 |
| 4623 | case AF_INET6: |
| 4624 | ip6 = mtod(m, struct ip6_hdr *); |
| 4625 | ip6->ip6_vfc = IPV6_VERSION; |
| 4626 | ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; |
| 4627 | ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; |
| 4628 | ip6->ip6_nxt = IPPROTO_TCP; |
| 4629 | /* ip6_plen will be updated in ip6_output() */ |
| 4630 | th = (struct tcphdr *)(ip6 + 1); |
| 4631 | th->th_dport = sc->sc_src.sin6.sin6_port; |
| 4632 | th->th_sport = sc->sc_dst.sin6.sin6_port; |
| 4633 | break; |
| 4634 | #endif |
| 4635 | default: |
| 4636 | th = NULL; |
| 4637 | } |
| 4638 | |
| 4639 | th->th_seq = htonl(sc->sc_iss); |
| 4640 | th->th_ack = htonl(sc->sc_irs + 1); |
| 4641 | th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; |
| 4642 | th->th_flags = TH_SYN|TH_ACK; |
| 4643 | th->th_win = htons(sc->sc_win); |
| 4644 | /* th_sum already 0 */ |
| 4645 | /* th_urp already 0 */ |
| 4646 | |
| 4647 | /* Tack on the TCP options. */ |
| 4648 | optp = (u_int8_t *)(th + 1); |
| 4649 | *optp++ = TCPOPT_MAXSEG; |
| 4650 | *optp++ = 4; |
| 4651 | *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; |
| 4652 | *optp++ = sc->sc_ourmaxseg & 0xff; |
| 4653 | |
| 4654 | if (sc->sc_request_r_scale != 15) { |
| 4655 | *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | |
| 4656 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | |
| 4657 | sc->sc_request_r_scale); |
| 4658 | optp += 4; |
| 4659 | } |
| 4660 | |
| 4661 | if (sc->sc_flags & SCF_TIMESTAMP) { |
| 4662 | u_int32_t *lp = (u_int32_t *)(optp); |
| 4663 | /* Form timestamp option as shown in appendix A of RFC 1323. */ |
| 4664 | *lp++ = htonl(TCPOPT_TSTAMP_HDR); |
| 4665 | *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc)); |
| 4666 | *lp = htonl(sc->sc_timestamp); |
| 4667 | optp += TCPOLEN_TSTAMP_APPA; |
| 4668 | } |
| 4669 | |
| 4670 | if (sc->sc_flags & SCF_SACK_PERMIT) { |
| 4671 | u_int8_t *p = optp; |
| 4672 | |
| 4673 | /* Let the peer know that we will SACK. */ |
| 4674 | p[0] = TCPOPT_SACK_PERMITTED; |
| 4675 | p[1] = 2; |
| 4676 | p[2] = TCPOPT_NOP; |
| 4677 | p[3] = TCPOPT_NOP; |
| 4678 | optp += 4; |
| 4679 | } |
| 4680 | |
| 4681 | /* |
| 4682 | * Send ECN SYN-ACK setup packet. |
| 4683 | * Routes can be asymetric, so, even if we receive a packet |
| 4684 | * with ECE and CWR set, we must not assume no one will block |
| 4685 | * the ECE packet we are about to send. |
| 4686 | */ |
| 4687 | if ((sc->sc_flags & SCF_ECN_PERMIT) && tp && |
| 4688 | SEQ_GEQ(tp->snd_nxt, tp->snd_max)) { |
| 4689 | th->th_flags |= TH_ECE; |
| 4690 | TCP_STATINC(TCP_STAT_ECN_SHS); |
| 4691 | |
| 4692 | /* |
| 4693 | * draft-ietf-tcpm-ecnsyn-00.txt |
| 4694 | * |
| 4695 | * "[...] a TCP node MAY respond to an ECN-setup |
| 4696 | * SYN packet by setting ECT in the responding |
| 4697 | * ECN-setup SYN/ACK packet, indicating to routers |
| 4698 | * that the SYN/ACK packet is ECN-Capable. |
| 4699 | * This allows a congested router along the path |
| 4700 | * to mark the packet instead of dropping the |
| 4701 | * packet as an indication of congestion." |
| 4702 | * |
| 4703 | * "[...] There can be a great benefit in setting |
| 4704 | * an ECN-capable codepoint in SYN/ACK packets [...] |
| 4705 | * Congestion is most likely to occur in |
| 4706 | * the server-to-client direction. As a result, |
| 4707 | * setting an ECN-capable codepoint in SYN/ACK |
| 4708 | * packets can reduce the occurence of three-second |
| 4709 | * retransmit timeouts resulting from the drop |
| 4710 | * of SYN/ACK packets." |
| 4711 | * |
| 4712 | * Page 4 and 6, January 2006. |
| 4713 | */ |
| 4714 | |
| 4715 | switch (sc->sc_src.sa.sa_family) { |
| 4716 | #ifdef INET |
| 4717 | case AF_INET: |
| 4718 | ip->ip_tos |= IPTOS_ECN_ECT0; |
| 4719 | break; |
| 4720 | #endif |
| 4721 | #ifdef INET6 |
| 4722 | case AF_INET6: |
| 4723 | ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); |
| 4724 | break; |
| 4725 | #endif |
| 4726 | } |
| 4727 | TCP_STATINC(TCP_STAT_ECN_ECT); |
| 4728 | } |
| 4729 | |
| 4730 | #ifdef TCP_SIGNATURE |
| 4731 | if (sc->sc_flags & SCF_SIGNATURE) { |
| 4732 | struct secasvar *sav; |
| 4733 | u_int8_t *sigp; |
| 4734 | |
| 4735 | sav = tcp_signature_getsav(m, th); |
| 4736 | |
| 4737 | if (sav == NULL) { |
| 4738 | if (m) |
| 4739 | m_freem(m); |
| 4740 | return (EPERM); |
| 4741 | } |
| 4742 | |
| 4743 | *optp++ = TCPOPT_SIGNATURE; |
| 4744 | *optp++ = TCPOLEN_SIGNATURE; |
| 4745 | sigp = optp; |
| 4746 | memset(optp, 0, TCP_SIGLEN); |
| 4747 | optp += TCP_SIGLEN; |
| 4748 | *optp++ = TCPOPT_NOP; |
| 4749 | *optp++ = TCPOPT_EOL; |
| 4750 | |
| 4751 | (void)tcp_signature(m, th, hlen, sav, sigp); |
| 4752 | |
| 4753 | key_sa_recordxfer(sav, m); |
| 4754 | KEY_FREESAV(&sav); |
| 4755 | } |
| 4756 | #endif |
| 4757 | |
| 4758 | /* Compute the packet's checksum. */ |
| 4759 | switch (sc->sc_src.sa.sa_family) { |
| 4760 | case AF_INET: |
| 4761 | ip->ip_len = htons(tlen - hlen); |
| 4762 | th->th_sum = 0; |
| 4763 | th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); |
| 4764 | break; |
| 4765 | #ifdef INET6 |
| 4766 | case AF_INET6: |
| 4767 | ip6->ip6_plen = htons(tlen - hlen); |
| 4768 | th->th_sum = 0; |
| 4769 | th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); |
| 4770 | break; |
| 4771 | #endif |
| 4772 | } |
| 4773 | |
| 4774 | /* |
| 4775 | * Fill in some straggling IP bits. Note the stack expects |
| 4776 | * ip_len to be in host order, for convenience. |
| 4777 | */ |
| 4778 | switch (sc->sc_src.sa.sa_family) { |
| 4779 | #ifdef INET |
| 4780 | case AF_INET: |
| 4781 | ip->ip_len = htons(tlen); |
| 4782 | ip->ip_ttl = ip_defttl; |
| 4783 | /* XXX tos? */ |
| 4784 | break; |
| 4785 | #endif |
| 4786 | #ifdef INET6 |
| 4787 | case AF_INET6: |
| 4788 | ip6->ip6_vfc &= ~IPV6_VERSION_MASK; |
| 4789 | ip6->ip6_vfc |= IPV6_VERSION; |
| 4790 | ip6->ip6_plen = htons(tlen - hlen); |
| 4791 | /* ip6_hlim will be initialized afterwards */ |
| 4792 | /* XXX flowlabel? */ |
| 4793 | break; |
| 4794 | #endif |
| 4795 | } |
| 4796 | |
| 4797 | /* XXX use IPsec policy on listening socket, on SYN ACK */ |
| 4798 | tp = sc->sc_tp; |
| 4799 | |
| 4800 | switch (sc->sc_src.sa.sa_family) { |
| 4801 | #ifdef INET |
| 4802 | case AF_INET: |
| 4803 | error = ip_output(m, sc->sc_ipopts, ro, |
| 4804 | (ip_mtudisc ? IP_MTUDISC : 0), |
| 4805 | NULL, so); |
| 4806 | break; |
| 4807 | #endif |
| 4808 | #ifdef INET6 |
| 4809 | case AF_INET6: |
| 4810 | ip6->ip6_hlim = in6_selecthlim(NULL, |
| 4811 | (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL); |
| 4812 | |
| 4813 | error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL, so, NULL); |
| 4814 | break; |
| 4815 | #endif |
| 4816 | default: |
| 4817 | error = EAFNOSUPPORT; |
| 4818 | break; |
| 4819 | } |
| 4820 | return (error); |
| 4821 | } |
| 4822 | |