| 1 | /* $NetBSD: tcp_vtw.h,v 1.7 2016/04/26 08:44:45 ozaki-r Exp $ */ |
| 2 | /* |
| 3 | * Copyright (c) 2011 The NetBSD Foundation, Inc. |
| 4 | * All rights reserved. |
| 5 | * |
| 6 | * This code is derived from software contributed to The NetBSD Foundation |
| 7 | * by Coyote Point Systems, Inc. |
| 8 | * |
| 9 | * Redistribution and use in source and binary forms, with or without |
| 10 | * modification, are permitted provided that the following conditions |
| 11 | * are met: |
| 12 | * 1. Redistributions of source code must retain the above copyright |
| 13 | * notice, this list of conditions and the following disclaimer. |
| 14 | * 2. Redistributions in binary form must reproduce the above copyright |
| 15 | * notice, this list of conditions and the following disclaimer in the |
| 16 | * documentation and/or other materials provided with the distribution. |
| 17 | * |
| 18 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
| 19 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
| 20 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| 21 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
| 22 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 23 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 24 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 25 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 26 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 27 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 28 | * POSSIBILITY OF SUCH DAMAGE. |
| 29 | */ |
| 30 | |
| 31 | /* |
| 32 | * Vestigial time-wait. |
| 33 | * |
| 34 | * This implementation uses cache-efficient techniques, which will |
| 35 | * appear somewhat peculiar. The main philosophy is to optimise the |
| 36 | * amount of information available within a cache line. Cache miss is |
| 37 | * expensive. So we employ ad-hoc techniques to pull a series of |
| 38 | * linked-list follows into a cache line. One cache line, multiple |
| 39 | * linked-list equivalents. |
| 40 | * |
| 41 | * One such ad-hoc technique is fat pointers. Additional degrees of |
| 42 | * ad-hoqueness result from having to hand tune it for pointer size |
| 43 | * and for cache line size. |
| 44 | * |
| 45 | * The 'fat pointer' approach aggregates, for x86_32, 15 linked-list |
| 46 | * data structures into one cache line. The additional 32 bits in the |
| 47 | * cache line are used for linking fat pointers, and for |
| 48 | * allocation/bookkeeping. |
| 49 | * |
| 50 | * The 15 32-bit tags encode the pointers to the linked list elements, |
| 51 | * and also encode the results of a search comparison. |
| 52 | * |
| 53 | * First, some more assumptions/restrictions. |
| 54 | * |
| 55 | * All the fat pointers are from a contiguous allocation arena. Thus, |
| 56 | * we can refer to them by offset from a base, not as full pointers. |
| 57 | * |
| 58 | * All the linked list data elements are also from a contiguous |
| 59 | * allocation arena, again so that we can refer to them as offset from |
| 60 | * a base. |
| 61 | * |
| 62 | * In order to add a data element to a fat pointer, a key value is |
| 63 | * computed, based on unique data within the data element. It is the |
| 64 | * linear searching of the linked lists of these elements based on |
| 65 | * these unique data that are being optimised here. |
| 66 | * |
| 67 | * Lets call the function that computes the key k(e), where e is the |
| 68 | * data element. In this example, k(e) returns 32-bits. |
| 69 | * |
| 70 | * Consider a set E (say of order 15) of data elements. Let K be |
| 71 | * the set of the k(e) for e in E. |
| 72 | * |
| 73 | * Let O be the set of the offsets from the base of the data elements in E. |
| 74 | * |
| 75 | * For each x in K, for each matching o in O, let t be x ^ o. These |
| 76 | * are the tags. (More or less). |
| 77 | * |
| 78 | * In order to search all the data elements in E, we compute the |
| 79 | * search key, and one at a time, XOR the key into the tags. If any |
| 80 | * result is a valid data element index, we have a possible match. If |
| 81 | * not, there is no match. |
| 82 | * |
| 83 | * The no-match cases mean we do not have to de-reference the pointer |
| 84 | * to the data element in question. We save cache miss penalty and |
| 85 | * cache load decreases. Only in the case of a valid looking data |
| 86 | * element index, do we have to look closer. |
| 87 | * |
| 88 | * Thus, in the absence of false positives, 15 data elements can be |
| 89 | * searched with one cache line fill, as opposed to 15 cache line |
| 90 | * fills for the usual implementation. |
| 91 | * |
| 92 | * The vestigial time waits (vtw_t), the data elements in the above, are |
| 93 | * searched by faddr, fport, laddr, lport. The key is a function of |
| 94 | * these values. |
| 95 | * |
| 96 | * We hash these keys into the traditional hash chains to reduce the |
| 97 | * search time, and use fat pointers to reduce the cache impacts of |
| 98 | * searching. |
| 99 | * |
| 100 | * The vtw_t are, per requirement, in a contiguous chunk. Allocation |
| 101 | * is done with a clock hand, and all vtw_t within one allocation |
| 102 | * domain have the same lifetime, so they will always be sorted by |
| 103 | * age. |
| 104 | * |
| 105 | * A vtw_t will be allocated, timestamped, and have a fixed future |
| 106 | * expiration. It will be added to a hash bucket implemented with fat |
| 107 | * pointers, which means that a cache line will be allocated in the |
| 108 | * hash bucket, placed at the head (more recent in time) and the vtw_t |
| 109 | * will be added to this. As more entries are added, the fat pointer |
| 110 | * cache line will fill, requiring additional cache lines for fat |
| 111 | * pointers to be allocated. These will be added at the head, and the |
| 112 | * aged entries will hang down, tapeworm like. As the vtw_t entries |
| 113 | * expire, the corresponding slot in the fat pointer will be |
| 114 | * reclaimed, and eventually the cache line will completely empty and |
| 115 | * be re-cycled, if not at the head of the chain. |
| 116 | * |
| 117 | * At times, a time-wait timer is restarted. This corresponds to |
| 118 | * deleting the current entry and re-adding it. |
| 119 | * |
| 120 | * Most of the time, they are just placed here to die. |
| 121 | */ |
| 122 | #ifndef _NETINET_TCP_VTW_H |
| 123 | #define _NETINET_TCP_VTW_H |
| 124 | |
| 125 | #include <sys/types.h> |
| 126 | #include <sys/socket.h> |
| 127 | #include <sys/sysctl.h> |
| 128 | #include <net/if.h> |
| 129 | #include <netinet/in.h> |
| 130 | #include <netinet/in_systm.h> |
| 131 | #include <netinet/ip.h> |
| 132 | #include <netinet/in_pcb.h> |
| 133 | #include <netinet/in_var.h> |
| 134 | #include <netinet/ip_var.h> |
| 135 | #include <netinet/in.h> |
| 136 | #include <netinet/tcp.h> |
| 137 | #include <netinet/tcp_timer.h> |
| 138 | #include <netinet/tcp_var.h> |
| 139 | #include <netinet6/in6.h> |
| 140 | #include <netinet/ip6.h> |
| 141 | #include <netinet6/ip6_var.h> |
| 142 | #include <netinet6/in6_pcb.h> |
| 143 | #include <netinet6/ip6_var.h> |
| 144 | #include <netinet6/in6_var.h> |
| 145 | #include <netinet/icmp6.h> |
| 146 | #include <netinet6/nd6.h> |
| 147 | |
| 148 | #define VTW_NCLASS (1+3) /* # different classes */ |
| 149 | |
| 150 | /* |
| 151 | * fat pointers, MI. |
| 152 | */ |
| 153 | struct fatp_mi; |
| 154 | |
| 155 | typedef uint32_t fatp_word_t; |
| 156 | |
| 157 | typedef struct fatp_mi fatp_t; |
| 158 | |
| 159 | /* Supported cacheline sizes: 32 64 128 bytes. See fatp_key(), |
| 160 | * fatp_slot_from_key(), fatp_xtra[]. |
| 161 | */ |
| 162 | #define FATP_NTAGS (CACHE_LINE_SIZE / sizeof(fatp_word_t) - 1) |
| 163 | #define FATP_NXT_WIDTH (sizeof(fatp_word_t) * NBBY - FATP_NTAGS) |
| 164 | |
| 165 | #define FATP_MAX (1 << FATP_NXT_WIDTH) |
| 166 | |
| 167 | /* Worked example: ULP32 with 64-byte cacheline (32-bit x86): |
| 168 | * 15 tags per cacheline. At most 2^17 fat pointers per fatp_ctl_t. |
| 169 | * The comments on the fatp_mi members, below, correspond to the worked |
| 170 | * example. |
| 171 | */ |
| 172 | struct fatp_mi { |
| 173 | fatp_word_t inuse : FATP_NTAGS; /* (1+15)*4 == CL_SIZE */ |
| 174 | fatp_word_t nxt : FATP_NXT_WIDTH;/* at most 2^17 fat pointers */ |
| 175 | fatp_word_t tag[FATP_NTAGS]; /* 15 tags per CL */ |
| 176 | }; |
| 177 | |
| 178 | static inline int |
| 179 | fatp_ntags(void) |
| 180 | { |
| 181 | return FATP_NTAGS; |
| 182 | } |
| 183 | |
| 184 | static inline int |
| 185 | fatp_full(fatp_t *fp) |
| 186 | { |
| 187 | fatp_t full; |
| 188 | |
| 189 | full.inuse = (1U << FATP_NTAGS) - 1U; |
| 190 | |
| 191 | return (fp->inuse == full.inuse); |
| 192 | } |
| 193 | |
| 194 | struct vtw_common; |
| 195 | struct vtw_v4; |
| 196 | struct vtw_v6; |
| 197 | struct vtw_ctl; |
| 198 | |
| 199 | /*!\brief common to all vtw |
| 200 | */ |
| 201 | typedef struct vtw_common { |
| 202 | struct timeval expire; /* date of birth+msl */ |
| 203 | uint32_t key; /* hash key: full hash */ |
| 204 | uint32_t port_key; /* hash key: local port hash */ |
| 205 | uint32_t rcv_nxt; |
| 206 | uint32_t rcv_wnd; |
| 207 | uint32_t snd_nxt; |
| 208 | uint32_t snd_scale : 8; /* window scaling for send win */ |
| 209 | uint32_t msl_class : 2; /* TCP MSL class {0,1,2,3} */ |
| 210 | uint32_t reuse_port : 1; |
| 211 | uint32_t reuse_addr : 1; |
| 212 | uint32_t v6only : 1; |
| 213 | uint32_t hashed : 1; /* reachable via FATP */ |
| 214 | uint32_t uid; |
| 215 | } vtw_t; |
| 216 | |
| 217 | /*!\brief vestigial timewait for IPv4 |
| 218 | */ |
| 219 | typedef struct vtw_v4 { |
| 220 | vtw_t common; /* must be first */ |
| 221 | uint16_t lport; |
| 222 | uint16_t fport; |
| 223 | uint32_t laddr; |
| 224 | uint32_t faddr; |
| 225 | } vtw_v4_t; |
| 226 | |
| 227 | /*!\brief vestigial timewait for IPv6 |
| 228 | */ |
| 229 | typedef struct vtw_v6 { |
| 230 | vtw_t common; /* must be first */ |
| 231 | uint16_t lport; |
| 232 | uint16_t fport; |
| 233 | struct in6_addr laddr; |
| 234 | struct in6_addr faddr; |
| 235 | } vtw_v6_t; |
| 236 | |
| 237 | struct fatp_ctl; |
| 238 | typedef struct vtw_ctl vtw_ctl_t; |
| 239 | typedef struct fatp_ctl fatp_ctl_t; |
| 240 | |
| 241 | /* |
| 242 | * The vestigial time waits are kept in a contiguous chunk. |
| 243 | * Allocation and free pointers run as clock hands thru this array. |
| 244 | */ |
| 245 | struct vtw_ctl { |
| 246 | fatp_ctl_t *fat; /* collection of fatp to use */ |
| 247 | vtw_ctl_t *ctl; /* <! controller's controller */ |
| 248 | union { |
| 249 | vtw_t *v; /* common */ |
| 250 | struct vtw_v4 *v4; /* IPv4 resources */ |
| 251 | struct vtw_v6 *v6; /* IPv6 resources */ |
| 252 | } base, /* base of vtw_t array */ |
| 253 | /**/ lim, /* extent of vtw_t array */ |
| 254 | /**/ alloc, /* allocation pointer */ |
| 255 | /**/ oldest; /* ^ to oldest */ |
| 256 | uint32_t nfree; /* # free */ |
| 257 | uint32_t nalloc; /* # allocated */ |
| 258 | uint32_t idx_mask; /* mask capturing all index bits*/ |
| 259 | uint32_t is_v4 : 1; |
| 260 | uint32_t is_v6 : 1; |
| 261 | uint32_t idx_bits: 6; |
| 262 | uint32_t clidx : 3; /* <! class index */ |
| 263 | }; |
| 264 | |
| 265 | /*!\brief Collections of fat pointers. |
| 266 | */ |
| 267 | struct fatp_ctl { |
| 268 | vtw_ctl_t *vtw; /* associated VTWs */ |
| 269 | fatp_t *base; /* base of fatp_t array */ |
| 270 | fatp_t *lim; /* extent of fatp_t array */ |
| 271 | fatp_t *free; /* free list */ |
| 272 | uint32_t mask; /* hash mask */ |
| 273 | uint32_t nfree; /* # free */ |
| 274 | uint32_t nalloc; /* # allocated */ |
| 275 | fatp_t **hash; /* hash anchors */ |
| 276 | fatp_t **port; /* port hash anchors */ |
| 277 | }; |
| 278 | |
| 279 | /*!\brief stats |
| 280 | */ |
| 281 | struct vtw_stats { |
| 282 | uint64_t ins; /* <! inserts */ |
| 283 | uint64_t del; /* <! deleted */ |
| 284 | uint64_t kill; /* <! assassination */ |
| 285 | uint64_t look[2]; /* <! lookup: full hash, port hash */ |
| 286 | uint64_t hit[2]; /* <! lookups that hit */ |
| 287 | uint64_t miss[2]; /* <! lookups that miss */ |
| 288 | uint64_t probe[2]; /* <! hits+miss */ |
| 289 | uint64_t losing[2]; /* <! misses requiring dereference */ |
| 290 | uint64_t max_chain[2]; /* <! max fatp chain traversed */ |
| 291 | uint64_t max_probe[2]; /* <! max probes in any one chain */ |
| 292 | uint64_t max_loss[2]; /* <! max losing probes in any one |
| 293 | * chain |
| 294 | */ |
| 295 | }; |
| 296 | |
| 297 | typedef struct vtw_stats vtw_stats_t; |
| 298 | |
| 299 | /*!\brief follow fatp next 'pointer' |
| 300 | */ |
| 301 | static inline fatp_t * |
| 302 | fatp_next(fatp_ctl_t *fat, fatp_t *fp) |
| 303 | { |
| 304 | return fp->nxt ? fat->base + fp->nxt-1 : 0; |
| 305 | } |
| 306 | |
| 307 | /*!\brief determine a collection-relative fat pointer index. |
| 308 | */ |
| 309 | static inline uint32_t |
| 310 | fatp_index(fatp_ctl_t *fat, fatp_t *fp) |
| 311 | { |
| 312 | return fp ? 1 + (fp - fat->base) : 0; |
| 313 | } |
| 314 | |
| 315 | |
| 316 | static inline uint32_t |
| 317 | v4_tag(uint32_t faddr, uint32_t fport, uint32_t laddr, uint32_t lport) |
| 318 | { |
| 319 | return (ntohl(faddr) + ntohs(fport) |
| 320 | + ntohl(laddr) + ntohs(lport)); |
| 321 | } |
| 322 | |
| 323 | static inline uint32_t |
| 324 | v6_tag(const struct in6_addr *faddr, uint16_t fport, |
| 325 | const struct in6_addr *laddr, uint16_t lport) |
| 326 | { |
| 327 | #ifdef IN6_HASH |
| 328 | return IN6_HASH(faddr, fport, laddr, lport); |
| 329 | #else |
| 330 | return 0; |
| 331 | #endif |
| 332 | } |
| 333 | |
| 334 | static inline uint32_t |
| 335 | v4_port_tag(uint16_t lport) |
| 336 | { |
| 337 | uint32_t tag = lport ^ (lport << 11); |
| 338 | |
| 339 | tag ^= tag << 3; |
| 340 | tag += tag >> 5; |
| 341 | tag ^= tag << 4; |
| 342 | tag += tag >> 17; |
| 343 | tag ^= tag << 25; |
| 344 | tag += tag >> 6; |
| 345 | |
| 346 | return tag; |
| 347 | } |
| 348 | |
| 349 | static inline uint32_t |
| 350 | v6_port_tag(uint16_t lport) |
| 351 | { |
| 352 | return v4_port_tag(lport); |
| 353 | } |
| 354 | |
| 355 | struct tcpcb; |
| 356 | struct tcphdr; |
| 357 | |
| 358 | int vtw_add(int, struct tcpcb *); |
| 359 | void vtw_del(vtw_ctl_t *, vtw_t *); |
| 360 | int vtw_lookup_v4(const struct ip *ip, const struct tcphdr *th, |
| 361 | uint32_t faddr, uint16_t fport, |
| 362 | uint32_t laddr, uint16_t lport); |
| 363 | struct ip6_hdr; |
| 364 | struct in6_addr; |
| 365 | |
| 366 | int vtw_lookup_v6(const struct ip6_hdr *ip, const struct tcphdr *th, |
| 367 | const struct in6_addr *faddr, uint16_t fport, |
| 368 | const struct in6_addr *laddr, uint16_t lport); |
| 369 | |
| 370 | typedef struct vestigial_inpcb { |
| 371 | union { |
| 372 | struct in_addr v4; |
| 373 | struct in6_addr v6; |
| 374 | } faddr, laddr; |
| 375 | uint16_t fport, lport; |
| 376 | uint32_t valid : 1; |
| 377 | uint32_t v4 : 1; |
| 378 | uint32_t reuse_addr : 1; |
| 379 | uint32_t reuse_port : 1; |
| 380 | uint32_t v6only : 1; |
| 381 | uint32_t more_tbd : 1; |
| 382 | uint32_t uid; |
| 383 | uint32_t rcv_nxt; |
| 384 | uint32_t rcv_wnd; |
| 385 | uint32_t snd_nxt; |
| 386 | struct vtw_common *vtw; |
| 387 | struct vtw_ctl *ctl; |
| 388 | } vestigial_inpcb_t; |
| 389 | |
| 390 | #ifdef _KERNEL |
| 391 | void vtw_restart(vestigial_inpcb_t*); |
| 392 | int vtw_earlyinit(void); |
| 393 | int sysctl_tcp_vtw_enable(SYSCTLFN_PROTO); |
| 394 | #endif /* _KERNEL */ |
| 395 | |
| 396 | #ifdef VTW_DEBUG |
| 397 | typedef struct sin_either { |
| 398 | uint8_t sin_len; |
| 399 | uint8_t sin_family; |
| 400 | uint16_t sin_port; |
| 401 | union { |
| 402 | struct in_addr v4; |
| 403 | struct in6_addr v6; |
| 404 | } sin_addr; |
| 405 | } sin_either_t; |
| 406 | |
| 407 | int vtw_debug_add(int af, sin_either_t *, sin_either_t *, int, int); |
| 408 | |
| 409 | typedef struct vtw_sysargs { |
| 410 | uint32_t op; |
| 411 | sin_either_t fa; |
| 412 | sin_either_t la; |
| 413 | } vtw_sysargs_t; |
| 414 | |
| 415 | #endif /* VTW_DEBUG */ |
| 416 | |
| 417 | #endif /* _NETINET_TCP_VTW_H */ |
| 418 | |