| 1 | /* $NetBSD: machdep.c,v 1.233 2016/11/17 16:26:08 maxv Exp $ */ |
| 2 | |
| 3 | /*- |
| 4 | * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011 |
| 5 | * The NetBSD Foundation, Inc. |
| 6 | * All rights reserved. |
| 7 | * |
| 8 | * This code is derived from software contributed to The NetBSD Foundation |
| 9 | * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace |
| 10 | * Simulation Facility, NASA Ames Research Center. |
| 11 | * |
| 12 | * This code is derived from software contributed to The NetBSD Foundation |
| 13 | * by Coyote Point Systems, Inc. which was written under contract to Coyote |
| 14 | * Point by Jed Davis and Devon O'Dell. |
| 15 | * |
| 16 | * Redistribution and use in source and binary forms, with or without |
| 17 | * modification, are permitted provided that the following conditions |
| 18 | * are met: |
| 19 | * 1. Redistributions of source code must retain the above copyright |
| 20 | * notice, this list of conditions and the following disclaimer. |
| 21 | * 2. Redistributions in binary form must reproduce the above copyright |
| 22 | * notice, this list of conditions and the following disclaimer in the |
| 23 | * documentation and/or other materials provided with the distribution. |
| 24 | * |
| 25 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
| 26 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
| 27 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| 28 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
| 29 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 30 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 31 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 32 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 33 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 34 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 35 | * POSSIBILITY OF SUCH DAMAGE. |
| 36 | */ |
| 37 | |
| 38 | /* |
| 39 | * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> |
| 40 | * |
| 41 | * Permission to use, copy, modify, and distribute this software for any |
| 42 | * purpose with or without fee is hereby granted, provided that the above |
| 43 | * copyright notice and this permission notice appear in all copies. |
| 44 | * |
| 45 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| 46 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| 47 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
| 48 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 49 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| 50 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
| 51 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 52 | */ |
| 53 | |
| 54 | /* |
| 55 | * Copyright (c) 2007 Manuel Bouyer. |
| 56 | * |
| 57 | * Redistribution and use in source and binary forms, with or without |
| 58 | * modification, are permitted provided that the following conditions |
| 59 | * are met: |
| 60 | * 1. Redistributions of source code must retain the above copyright |
| 61 | * notice, this list of conditions and the following disclaimer. |
| 62 | * 2. Redistributions in binary form must reproduce the above copyright |
| 63 | * notice, this list of conditions and the following disclaimer in the |
| 64 | * documentation and/or other materials provided with the distribution. |
| 65 | * |
| 66 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
| 67 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| 68 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| 69 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
| 70 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
| 71 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 72 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 73 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 74 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| 75 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 76 | * |
| 77 | */ |
| 78 | |
| 79 | /*- |
| 80 | * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. |
| 81 | * All rights reserved. |
| 82 | * |
| 83 | * This code is derived from software contributed to Berkeley by |
| 84 | * William Jolitz. |
| 85 | * |
| 86 | * Redistribution and use in source and binary forms, with or without |
| 87 | * modification, are permitted provided that the following conditions |
| 88 | * are met: |
| 89 | * 1. Redistributions of source code must retain the above copyright |
| 90 | * notice, this list of conditions and the following disclaimer. |
| 91 | * 2. Redistributions in binary form must reproduce the above copyright |
| 92 | * notice, this list of conditions and the following disclaimer in the |
| 93 | * documentation and/or other materials provided with the distribution. |
| 94 | * 3. Neither the name of the University nor the names of its contributors |
| 95 | * may be used to endorse or promote products derived from this software |
| 96 | * without specific prior written permission. |
| 97 | * |
| 98 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 99 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 100 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 101 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 102 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 103 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 104 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 105 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 106 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 107 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 108 | * SUCH DAMAGE. |
| 109 | * |
| 110 | * @(#)machdep.c 7.4 (Berkeley) 6/3/91 |
| 111 | */ |
| 112 | |
| 113 | #include <sys/cdefs.h> |
| 114 | __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.233 2016/11/17 16:26:08 maxv Exp $"); |
| 115 | |
| 116 | /* #define XENDEBUG_LOW */ |
| 117 | |
| 118 | #include "opt_modular.h" |
| 119 | #include "opt_user_ldt.h" |
| 120 | #include "opt_ddb.h" |
| 121 | #include "opt_kgdb.h" |
| 122 | #include "opt_cpureset_delay.h" |
| 123 | #include "opt_mtrr.h" |
| 124 | #include "opt_realmem.h" |
| 125 | #include "opt_xen.h" |
| 126 | #ifndef XEN |
| 127 | #include "opt_physmem.h" |
| 128 | #endif |
| 129 | #include "isa.h" |
| 130 | #include "pci.h" |
| 131 | |
| 132 | #include <sys/param.h> |
| 133 | #include <sys/systm.h> |
| 134 | #include <sys/signal.h> |
| 135 | #include <sys/signalvar.h> |
| 136 | #include <sys/kernel.h> |
| 137 | #include <sys/cpu.h> |
| 138 | #include <sys/exec.h> |
| 139 | #include <sys/exec_aout.h> /* for MID_* */ |
| 140 | #include <sys/reboot.h> |
| 141 | #include <sys/conf.h> |
| 142 | #include <sys/mbuf.h> |
| 143 | #include <sys/msgbuf.h> |
| 144 | #include <sys/mount.h> |
| 145 | #include <sys/core.h> |
| 146 | #include <sys/kcore.h> |
| 147 | #include <sys/ucontext.h> |
| 148 | #include <machine/kcore.h> |
| 149 | #include <sys/ras.h> |
| 150 | #include <sys/syscallargs.h> |
| 151 | #include <sys/ksyms.h> |
| 152 | #include <sys/device.h> |
| 153 | #include <sys/lwp.h> |
| 154 | #include <sys/proc.h> |
| 155 | |
| 156 | #ifdef KGDB |
| 157 | #include <sys/kgdb.h> |
| 158 | #endif |
| 159 | |
| 160 | #include <dev/cons.h> |
| 161 | #include <dev/mm.h> |
| 162 | |
| 163 | #include <uvm/uvm.h> |
| 164 | #include <uvm/uvm_page.h> |
| 165 | |
| 166 | #include <sys/sysctl.h> |
| 167 | |
| 168 | #include <machine/cpu.h> |
| 169 | #include <machine/cpufunc.h> |
| 170 | #include <machine/gdt.h> |
| 171 | #include <machine/intr.h> |
| 172 | #include <machine/pio.h> |
| 173 | #include <machine/psl.h> |
| 174 | #include <machine/reg.h> |
| 175 | #include <machine/specialreg.h> |
| 176 | #include <machine/bootinfo.h> |
| 177 | #include <x86/fpu.h> |
| 178 | #include <machine/mtrr.h> |
| 179 | #include <machine/mpbiosvar.h> |
| 180 | |
| 181 | #include <x86/cputypes.h> |
| 182 | #include <x86/cpuvar.h> |
| 183 | #include <x86/machdep.h> |
| 184 | |
| 185 | #include <x86/x86/tsc.h> |
| 186 | |
| 187 | #include <dev/isa/isareg.h> |
| 188 | #include <machine/isa_machdep.h> |
| 189 | #include <dev/ic/i8042reg.h> |
| 190 | |
| 191 | #ifdef XEN |
| 192 | #include <xen/xen.h> |
| 193 | #include <xen/hypervisor.h> |
| 194 | #include <xen/evtchn.h> |
| 195 | #endif |
| 196 | |
| 197 | #ifdef DDB |
| 198 | #include <machine/db_machdep.h> |
| 199 | #include <ddb/db_extern.h> |
| 200 | #include <ddb/db_output.h> |
| 201 | #include <ddb/db_interface.h> |
| 202 | #endif |
| 203 | |
| 204 | #include "acpica.h" |
| 205 | |
| 206 | #if NACPICA > 0 |
| 207 | #include <dev/acpi/acpivar.h> |
| 208 | #define ACPI_MACHDEP_PRIVATE |
| 209 | #include <machine/acpi_machdep.h> |
| 210 | #endif |
| 211 | |
| 212 | #include "isa.h" |
| 213 | #include "isadma.h" |
| 214 | #include "ksyms.h" |
| 215 | |
| 216 | /* the following is used externally (sysctl_hw) */ |
| 217 | char machine[] = "amd64"; /* CPU "architecture" */ |
| 218 | char machine_arch[] = "x86_64"; /* machine == machine_arch */ |
| 219 | |
| 220 | #ifdef CPURESET_DELAY |
| 221 | int cpureset_delay = CPURESET_DELAY; |
| 222 | #else |
| 223 | int cpureset_delay = 2000; /* default to 2s */ |
| 224 | #endif |
| 225 | |
| 226 | int cpu_class = CPUCLASS_686; |
| 227 | |
| 228 | #ifdef MTRR |
| 229 | struct mtrr_funcs *mtrr_funcs; |
| 230 | #endif |
| 231 | |
| 232 | uint64_t dumpmem_low; |
| 233 | uint64_t dumpmem_high; |
| 235 | int use_pae; |
| 236 | |
| 237 | #ifndef NO_SPARSE_DUMP |
| 238 | int sparse_dump = 1; |
| 239 | |
| 240 | paddr_t max_paddr = 0; |
| 241 | unsigned char *sparse_dump_physmap; |
| 242 | #endif |
| 243 | |
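| | /* |
| | * Staging buffer for the kernel core dump headers: dump_header_addbytes() |
| | * accumulates header bytes here one page at a time, and dump_header_flush() |
| | * writes the buffer out to the dump device at dump_header_blkno. |
| | */ |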
| 244 | char *dump_headerbuf, *dump_headerbuf_ptr; |
| 245 | #define dump_headerbuf_size PAGE_SIZE |
| 246 | #define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size) |
| 247 | #define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr) |
| 248 | daddr_t dump_header_blkno; |
| 249 | |
| 250 | size_t dump_nmemsegs; |
| 251 | size_t dump_npages; |
| 252 | size_t dump_header_size; |
| 253 | size_t dump_totalbytesleft; |
| 254 | |
| 255 | vaddr_t msgbuf_vaddr; |
| 256 | |
| 257 | struct { |
| 258 | paddr_t paddr; |
| 259 | psize_t sz; |
| 260 | } msgbuf_p_seg[VM_PHYSSEG_MAX]; |
| 261 | unsigned int msgbuf_p_cnt = 0; |
| 262 | |
| 263 | vaddr_t idt_vaddr; |
| 264 | paddr_t idt_paddr; |
| 265 | vaddr_t gdt_vaddr; |
| 266 | paddr_t gdt_paddr; |
| 267 | vaddr_t ldt_vaddr; |
| 268 | paddr_t ldt_paddr; |
| 269 | |
| 270 | vaddr_t module_start, module_end; |
| 271 | static struct vm_map module_map_store; |
| 272 | extern struct vm_map *module_map; |
| 273 | vaddr_t kern_end; |
| 274 | |
| 275 | struct vm_map *phys_map = NULL; |
| 276 | |
| 277 | extern paddr_t avail_start, avail_end; |
| 278 | #ifdef XEN |
| 279 | extern paddr_t pmap_pa_start, pmap_pa_end; |
| 280 | #endif |
| 281 | |
| 282 | #ifndef XEN |
| 283 | void (*delay_func)(unsigned int) = i8254_delay; |
| 284 | void (*initclock_func)(void) = i8254_initclocks; |
| 285 | #else /* XEN */ |
| 286 | void (*delay_func)(unsigned int) = xen_delay; |
| 287 | void (*initclock_func)(void) = xen_initclocks; |
| 288 | #endif |
| 289 | |
| 290 | |
| 291 | /* |
| 292 | * Size of memory segments, before any memory is stolen. |
| 293 | */ |
| 294 | phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX]; |
| 295 | int mem_cluster_cnt; |
| 296 | |
| 297 | char x86_64_doubleflt_stack[4096]; |
| 298 | |
| 299 | int cpu_dump(void); |
| 300 | int cpu_dumpsize(void); |
| 301 | u_long cpu_dump_mempagecnt(void); |
| 302 | void dodumpsys(void); |
| 303 | void dumpsys(void); |
| 304 | |
| 305 | extern int time_adjusted; /* XXX no common header */ |
| 306 | |
| 307 | void dump_misc_init(void); |
| 308 | void dump_seg_prep(void); |
| 309 | int dump_seg_iter(int (*)(paddr_t, paddr_t)); |
| 310 | |
| 311 | #ifndef NO_SPARSE_DUMP |
| 312 | void sparse_dump_reset(void); |
| 313 | void sparse_dump_mark(void); |
| 314 | void cpu_dump_prep_sparse(void); |
| 315 | #endif |
| 316 | |
| 317 | void dump_header_start(void); |
| 318 | int dump_header_flush(void); |
| 319 | int dump_header_addbytes(const void*, size_t); |
| 320 | int dump_header_addseg(paddr_t, paddr_t); |
| 321 | int dump_header_finish(void); |
| 322 | |
| 323 | int dump_seg_count_range(paddr_t, paddr_t); |
| 324 | int dumpsys_seg(paddr_t, paddr_t); |
| 325 | |
| 326 | void init_x86_64(paddr_t); |
| 327 | |
| 328 | static int valid_user_selector(struct lwp *, uint64_t); |
| 329 | |
| 330 | /* |
| 331 | * Machine-dependent startup code |
| 332 | */ |
| 333 | void |
| 334 | cpu_startup(void) |
| 335 | { |
| 336 | int x, y; |
| 337 | vaddr_t minaddr, maxaddr; |
| 338 | psize_t sz; |
| 339 | |
| 340 | /* |
| 341 | * For console drivers that require uvm and pmap to be initialized, |
| 342 | * we'll give them one more chance here... |
| 343 | */ |
| 344 | consinit(); |
| 345 | |
| 346 | /* |
| 347 | * Initialize error message buffer (at end of core). |
| 348 | */ |
| 349 | if (msgbuf_p_cnt == 0) |
| 350 | panic("msgbuf paddr map has not been set up" ); |
| 351 | for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz) |
| 352 | continue; |
| 353 | |
| 354 | msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY); |
| 355 | if (msgbuf_vaddr == 0) |
| 356 | panic("failed to valloc msgbuf_vaddr" ); |
| 357 | |
| 358 | for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) { |
| 359 | for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE) |
| 360 | pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz, |
| 361 | msgbuf_p_seg[y].paddr + x * PAGE_SIZE, |
| 362 | VM_PROT_READ|VM_PROT_WRITE, 0); |
| 363 | } |
| 364 | |
| 365 | pmap_update(pmap_kernel()); |
| 366 | |
| 367 | initmsgbuf((void *)msgbuf_vaddr, round_page(sz)); |
| 368 | |
| 369 | minaddr = 0; |
| 370 | |
| 371 | /* |
| 372 | * Allocate a submap for physio. |
| 373 | */ |
| 374 | phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, |
| 375 | VM_PHYS_SIZE, 0, false, NULL); |
| 376 | |
| 377 | /* |
| 378 | * Create the module map. |
| 379 | * |
| 380 | * The kernel uses RIP-relative addressing with a maximum offset of |
| 381 | * 2GB. The problem is, kernel_map is too far away in memory from |
| 382 | * the kernel .text. So we cannot use it, and have to create a |
| 383 | * special module_map. |
| 384 | * |
| 385 | * The module map is taken as what is left of the bootstrap memory |
| 386 | * created in locore.S. This memory is right above the kernel |
| 387 | * image, so this is the best place to put our modules. |
| 388 | */ |
| 389 | uvm_map_setup(&module_map_store, module_start, module_end, 0); |
| 390 | module_map_store.pmap = pmap_kernel(); |
| 391 | module_map = &module_map_store; |
| 392 | |
| 393 | /* Say hello. */ |
| 394 | banner(); |
| 395 | |
| 396 | #if NISA > 0 || NPCI > 0 |
| 397 | /* Safe for i/o port / memory space allocation to use malloc now. */ |
| 398 | x86_bus_space_mallocok(); |
| 399 | #endif |
| 400 | |
| 401 | gdt_init(); |
| 402 | x86_64_proc0_tss_ldt_init(); |
| 403 | |
| 404 | cpu_init_tss(&cpu_info_primary); |
| 405 | #if !defined(XEN) |
| 406 | ltr(cpu_info_primary.ci_tss_sel); |
| 407 | #endif /* !defined(XEN) */ |
| 408 | |
| 409 | x86_startup(); |
| 410 | } |
| 411 | |
| 412 | #ifdef XEN |
| 413 | /* used in assembly */ |
| 414 | void hypervisor_callback(void); |
| 415 | void failsafe_callback(void); |
| 416 | void x86_64_switch_context(struct pcb *); |
| 417 | void x86_64_tls_switch(struct lwp *); |
| 418 | |
| 419 | void |
| 420 | x86_64_switch_context(struct pcb *new) |
| 421 | { |
| 422 | struct physdev_op physop; |
| 423 | HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0); |
| 424 | physop.cmd = PHYSDEVOP_SET_IOPL; |
| 425 | physop.u.set_iopl.iopl = new->pcb_iopl; |
| 426 | HYPERVISOR_physdev_op(&physop); |
| 427 | } |
| 428 | |
| 429 | void |
| 430 | x86_64_tls_switch(struct lwp *l) |
| 431 | { |
| 432 | struct cpu_info *ci = curcpu(); |
| 433 | struct pcb *pcb = lwp_getpcb(l); |
| 434 | struct trapframe *tf = l->l_md.md_regs; |
| 435 | |
| 436 | /* |
| 437 | * Raise the IPL to IPL_HIGH. |
| 438 | * FPU IPIs can alter the LWP's saved cr0. Dropping the priority |
| 439 | * is deferred until mi_switch(), when cpu_switchto() returns. |
| 440 | */ |
| 441 | (void)splhigh(); |
| 442 | /* |
| 443 | * If our floating point registers are on a different CPU, |
| 444 | * set CR0_TS so we'll trap rather than reuse bogus state. |
| 445 | */ |
| 446 | if (l != ci->ci_fpcurlwp) { |
| 447 | HYPERVISOR_fpu_taskswitch(1); |
| 448 | } |
| 449 | |
| 450 | /* Update TLS segment pointers */ |
| 451 | if (pcb->pcb_flags & PCB_COMPAT32) { |
| 452 | update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs); |
| 453 | update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs); |
| 454 | setfs(tf->tf_fs); |
| 455 | HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs); |
| 456 | } else { |
| 457 | setfs(0); |
| 458 | HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0); |
| 459 | HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs); |
| 460 | HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs); |
| 461 | } |
| 462 | } |
| 463 | #endif /* XEN */ |
| 464 | |
| 465 | /* |
| 466 | * Set up proc0's TSS and LDT. |
| 467 | */ |
| 468 | void |
| 469 | x86_64_proc0_tss_ldt_init(void) |
| 470 | { |
| 471 | struct lwp *l = &lwp0; |
| 472 | struct pcb *pcb = lwp_getpcb(l); |
| 473 | |
| 474 | pcb->pcb_flags = 0; |
| 475 | pcb->pcb_fs = 0; |
| 476 | pcb->pcb_gs = 0; |
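| | /* Kernel stack top: 16 bytes below the end of the uarea, 16-byte aligned. */ |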
| 477 | pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + USPACE - 16) & ~0xf; |
| 478 | pcb->pcb_iopl = SEL_KPL; |
| 479 | |
| 480 | pmap_kernel()->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); |
| 481 | pcb->pcb_cr0 = rcr0() & ~CR0_TS; |
| 482 | l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1; |
| 483 | |
| 484 | #if !defined(XEN) |
| 485 | lldt(pmap_kernel()->pm_ldt_sel); |
| 486 | #else |
| 487 | { |
| 488 | struct physdev_op physop; |
| 489 | xen_set_ldt((vaddr_t) ldtstore, LDT_SIZE >> 3); |
| 490 | /* Reset TS bit and set kernel stack for interrupt handlers */ |
| 491 | HYPERVISOR_fpu_taskswitch(1); |
| 492 | HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0); |
| 493 | physop.cmd = PHYSDEVOP_SET_IOPL; |
| 494 | physop.u.set_iopl.iopl = pcb->pcb_iopl; |
| 495 | HYPERVISOR_physdev_op(&physop); |
| 496 | } |
| 497 | #endif /* XEN */ |
| 498 | } |
| 499 | |
| 500 | /* |
| 501 | * Set up TSS and I/O bitmap. |
| 502 | */ |
| 503 | void |
| 504 | cpu_init_tss(struct cpu_info *ci) |
| 505 | { |
| 506 | struct x86_64_tss *tss = &ci->ci_tss; |
| 507 | uintptr_t p; |
| 508 | |
| 509 | tss->tss_iobase = IOMAP_INVALOFF << 16; |
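| | /* |
| | * An I/O map base at or beyond the TSS limit disables the I/O permission |
| | * bitmap, so userland in/out instructions always trap. |
| | */ |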
| 510 | /* tss->tss_ist[0] is filled by cpu_intr_init */ |
| 511 | |
| 512 | /* double fault */ |
| 513 | tss->tss_ist[1] = (uint64_t)x86_64_doubleflt_stack + PAGE_SIZE - 16; |
| 514 | |
| 515 | /* NMI */ |
| 516 | p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED); |
| 517 | tss->tss_ist[2] = p + PAGE_SIZE - 16; |
| 518 | ci->ci_tss_sel = tss_alloc(tss); |
| 519 | } |
| 520 | |
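| | /* |
| | * Redirect the LWP's trapframe so that the next return to userland enters |
| | * the signal handler 'catcher' with its stack pointer at frame 'f'. |
| | */ |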
| 521 | void |
| 522 | buildcontext(struct lwp *l, void *catcher, void *f) |
| 523 | { |
| 524 | struct trapframe *tf = l->l_md.md_regs; |
| 525 | |
| 526 | tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL); |
| 527 | tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL); |
| 528 | tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL); |
| 529 | tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL); |
| 530 | |
| 531 | tf->tf_rip = (uint64_t)catcher; |
| 532 | tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); |
| 533 | tf->tf_rflags &= ~PSL_CLEARSIG; |
| 534 | tf->tf_rsp = (uint64_t)f; |
| 535 | tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL); |
| 536 | |
| 537 | /* Ensure FP state is sane */ |
| 538 | fpu_save_area_reset(l); |
| 539 | } |
| 540 | |
| 541 | void |
| 542 | sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask) |
| 543 | { |
| 544 | |
| 545 | printf("sendsig_sigcontext: illegal\n" ); |
| 546 | sigexit(curlwp, SIGILL); |
| 547 | } |
| 548 | |
| 549 | void |
| 550 | sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask) |
| 551 | { |
| 552 | struct lwp *l = curlwp; |
| 553 | struct proc *p = l->l_proc; |
| 554 | struct sigacts *ps = p->p_sigacts; |
| 555 | int onstack, error; |
| 556 | int sig = ksi->ksi_signo; |
| 557 | struct sigframe_siginfo *fp, frame; |
| 558 | sig_t catcher = SIGACTION(p, sig).sa_handler; |
| 559 | struct trapframe *tf = l->l_md.md_regs; |
| 560 | char *sp; |
| 561 | |
| 562 | KASSERT(mutex_owned(p->p_lock)); |
| 563 | |
| 564 | /* Do we need to jump onto the signal stack? */ |
| 565 | onstack = |
| 566 | (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 && |
| 567 | (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0; |
| 568 | |
| 569 | /* Allocate space for the signal handler context. */ |
| 570 | if (onstack) |
| 571 | sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size); |
| 572 | else |
| 573 | /* AMD64 ABI 128-bytes "red zone". */ |
| 574 | sp = (char *)tf->tf_rsp - 128; |
| 575 | |
| 576 | sp -= sizeof(struct sigframe_siginfo); |
| 577 | /* The ABI requires %rsp+8 to be 16-byte aligned at handler entry, so align sp down to 16 bytes and subtract 8 (sf_ra fills the return-address slot). */ |
| 578 | fp = (struct sigframe_siginfo *)(((unsigned long)sp & ~15) - 8); |
| 579 | |
| 580 | frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp; |
| 581 | frame.sf_si._info = ksi->ksi_info; |
| 582 | frame.sf_uc.uc_flags = _UC_SIGMASK; |
| 583 | frame.sf_uc.uc_sigmask = *mask; |
| 584 | frame.sf_uc.uc_link = l->l_ctxlink; |
| 585 | frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK) |
| 586 | ? _UC_SETSTACK : _UC_CLRSTACK; |
| 587 | memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack)); |
| 588 | sendsig_reset(l, sig); |
| 589 | |
| 590 | mutex_exit(p->p_lock); |
| 591 | cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags); |
| 592 | /* Copyout all the fp regs, the signal handler might expect them. */ |
| 593 | error = copyout(&frame, fp, sizeof frame); |
| 594 | mutex_enter(p->p_lock); |
| 595 | |
| 596 | if (error != 0) { |
| 597 | /* |
| 598 | * Process has trashed its stack; give it an illegal |
| 599 | * instruction to halt it in its tracks. |
| 600 | */ |
| 601 | sigexit(l, SIGILL); |
| 602 | /* NOTREACHED */ |
| 603 | } |
| 604 | |
| 605 | buildcontext(l, catcher, fp); |
| 606 | |
| 607 | tf->tf_rdi = sig; |
| 608 | tf->tf_rsi = (uint64_t)&fp->sf_si; |
| 609 | tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc; |
| 610 | |
| 611 | /* Remember that we're now on the signal stack. */ |
| 612 | if (onstack) |
| 613 | l->l_sigstk.ss_flags |= SS_ONSTACK; |
| 614 | |
| 615 | if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) { |
| 616 | /* |
| 617 | * The process has given an invalid address for the |
| 618 | * handler. Stop it, but not before the signal frame is |
| 619 | * set up, so the right info reaches userland (or the core dump). |
| 620 | */ |
| 621 | sigexit(l, SIGILL); |
| 622 | /* NOTREACHED */ |
| 623 | } |
| 624 | } |
| 625 | |
| 626 | struct pcb dumppcb; |
| 627 | |
| 628 | void |
| 629 | cpu_reboot(int howto, char *bootstr) |
| 630 | { |
| 631 | static bool syncdone = false; |
| 632 | int s = IPL_NONE; |
| 633 | __USE(s); /* ugly otherwise */ |
| 634 | |
| 635 | if (cold) { |
| 636 | howto |= RB_HALT; |
| 637 | goto haltsys; |
| 638 | } |
| 639 | |
| 640 | boothowto = howto; |
| 641 | |
| 642 | /* i386 maybe_dump() */ |
| 643 | |
| 644 | /* |
| 645 | * If we've panic'd, don't make the situation potentially |
| 646 | * worse by syncing or unmounting the file systems. |
| 647 | */ |
| 648 | if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) { |
| 649 | if (!syncdone) { |
| 650 | syncdone = true; |
| 651 | /* XXX used to force unmount as well, here */ |
| 652 | vfs_sync_all(curlwp); |
| 653 | /* |
| 654 | * If we've been adjusting the clock, the todr |
| 655 | * will be out of synch; adjust it now. |
| 656 | * |
| 657 | * XXX used to do this after unmounting all |
| 658 | * filesystems with vfs_shutdown(). |
| 659 | */ |
| 660 | if (time_adjusted != 0) |
| 661 | resettodr(); |
| 662 | } |
| 663 | |
| 664 | while (vfs_unmountall1(curlwp, false, false) || |
| 665 | config_detach_all(boothowto) || |
| 666 | vfs_unmount_forceone(curlwp)) |
| 667 | ; /* do nothing */ |
| 668 | } else |
| 669 | suspendsched(); |
| 670 | |
| 671 | pmf_system_shutdown(boothowto); |
| 672 | |
| 673 | /* Disable interrupts. */ |
| 674 | s = splhigh(); |
| 675 | |
| 676 | /* Do a dump if requested. */ |
| 677 | if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP) |
| 678 | dumpsys(); |
| 679 | |
| 680 | haltsys: |
| 681 | doshutdownhooks(); |
| 682 | |
| 683 | if ((howto & RB_POWERDOWN) == RB_POWERDOWN) { |
| 684 | #if NACPICA > 0 |
| 685 | if (s != IPL_NONE) |
| 686 | splx(s); |
| 687 | |
| 688 | acpi_enter_sleep_state(ACPI_STATE_S5); |
| 689 | #endif |
| 690 | #ifdef XEN |
| 691 | HYPERVISOR_shutdown(); |
| 692 | #endif /* XEN */ |
| 693 | } |
| 694 | |
| 695 | cpu_broadcast_halt(); |
| 696 | |
| 697 | if (howto & RB_HALT) { |
| 698 | #if NACPICA > 0 |
| 699 | acpi_disable(); |
| 700 | #endif |
| 701 | |
| 702 | printf("\n" ); |
| 703 | printf("The operating system has halted.\n" ); |
| 704 | printf("Please press any key to reboot.\n\n" ); |
| 705 | cnpollc(1); /* for proper keyboard command handling */ |
| 706 | if (cngetc() == 0) { |
| 707 | /* no console attached, so just hlt */ |
| 708 | printf("No keyboard - cannot reboot after all.\n" ); |
| 709 | for(;;) { |
| 710 | x86_hlt(); |
| 711 | } |
| 712 | } |
| 713 | cnpollc(0); |
| 714 | } |
| 715 | |
| 716 | printf("rebooting...\n" ); |
| 717 | if (cpureset_delay > 0) |
| 718 | delay(cpureset_delay * 1000); |
| 719 | cpu_reset(); |
| 720 | for(;;) ; |
| 721 | /*NOTREACHED*/ |
| 722 | } |
| 723 | |
| 724 | /* |
| 725 | * XXXfvdl share dumpcode. |
| 726 | */ |
| 727 | |
| 728 | /* |
| 729 | * Perform assorted dump-related initialization tasks. Assumes that |
| 730 | * the maximum physical memory address will not increase afterwards. |
| 731 | */ |
| 732 | void |
| 733 | dump_misc_init(void) |
| 734 | { |
| 735 | #ifndef NO_SPARSE_DUMP |
| 736 | int i; |
| 737 | #endif |
| 738 | |
| 739 | if (dump_headerbuf != NULL) |
| 740 | return; /* already called */ |
| 741 | |
| 742 | #ifndef NO_SPARSE_DUMP |
| 743 | for (i = 0; i < mem_cluster_cnt; ++i) { |
| 744 | paddr_t top = mem_clusters[i].start + mem_clusters[i].size; |
| 745 | if (max_paddr < top) |
| 746 | max_paddr = top; |
| 747 | } |
| 748 | #ifdef DEBUG |
| 749 | printf("dump_misc_init: max_paddr = 0x%lx\n" , |
| 750 | (unsigned long)max_paddr); |
| 751 | #endif |
| 752 | if (max_paddr == 0) { |
| 753 | printf("Your machine does not initialize mem_clusters; " |
| 754 | "sparse_dumps disabled\n" ); |
| 755 | sparse_dump = 0; |
| 756 | } else { |
| 757 | sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map, |
| 758 | roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE), |
| 759 | PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO); |
| 760 | } |
| 761 | #endif |
| 762 | dump_headerbuf = (void *)uvm_km_alloc(kernel_map, |
| 763 | dump_headerbuf_size, |
| 764 | PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO); |
| 765 | /* XXXjld should check for failure here, disable dumps if so. */ |
| 766 | } |
| 767 | |
| 768 | #ifndef NO_SPARSE_DUMP |
| 769 | /* |
| 770 | * Clear the set of pages to include in a sparse dump. |
| 771 | */ |
| 772 | void |
| 773 | sparse_dump_reset(void) |
| 774 | { |
| 775 | memset(sparse_dump_physmap, 0, |
| 776 | roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE)); |
| 777 | } |
| 778 | |
| 779 | /* |
| 780 | * Include or exclude pages in a sparse dump. |
| 781 | */ |
| 782 | void |
| 783 | sparse_dump_mark(void) |
| 784 | { |
| 785 | paddr_t p, pstart, pend; |
| 786 | struct vm_page *pg; |
| 787 | int i; |
| 788 | |
| 789 | /* |
| 790 | * Mark all memory pages, then unmark pages that are uninteresting. |
| 791 | * Dereferencing pg->uobject might crash again if another CPU |
| 792 | * frees the object out from under us, but we can't lock anything |
| 793 | * so it's a risk we have to take. |
| 794 | */ |
| 795 | |
| 796 | for (i = 0; i < mem_cluster_cnt; ++i) { |
| 797 | pstart = mem_clusters[i].start / PAGE_SIZE; |
| 798 | pend = pstart + mem_clusters[i].size / PAGE_SIZE; |
| 799 | |
| 800 | for (p = pstart; p < pend; p++) { |
| 801 | setbit(sparse_dump_physmap, p); |
| 802 | } |
| 803 | } |
| 804 | for (i = 0; i < vm_nphysseg; i++) { |
| 805 | struct vm_physseg *seg = VM_PHYSMEM_PTR(i); |
| 806 | |
| 807 | for (pg = seg->pgs; pg < seg->lastpg; pg++) { |
| 808 | if (pg->uanon || (pg->pqflags & PQ_FREE) || |
| 809 | (pg->uobject && pg->uobject->pgops)) { |
| 810 | p = VM_PAGE_TO_PHYS(pg) / PAGE_SIZE; |
| 811 | clrbit(sparse_dump_physmap, p); |
| 812 | } |
| 813 | } |
| 814 | } |
| 815 | } |
| 816 | |
| 817 | /* |
| 818 | * Machine-dependently decides on the contents of a sparse dump, using |
| 819 | * the above. |
| 820 | */ |
| 821 | void |
| 822 | cpu_dump_prep_sparse(void) |
| 823 | { |
| 824 | sparse_dump_reset(); |
| 825 | /* XXX could the alternate recursive page table be skipped? */ |
| 826 | sparse_dump_mark(); |
| 827 | /* Memory for I/O buffers could be unmarked here, for example. */ |
| 828 | /* The kernel text could also be unmarked, but gdb would be upset. */ |
| 829 | } |
| 830 | #endif |
| 831 | |
| 832 | /* |
| 833 | * Abstractly iterate over the collection of memory segments to be |
| 834 | * dumped; the callback lacks the customary environment-pointer |
| 835 | * argument because none of the current users really need one. |
| 836 | * |
| 837 | * To be used only after dump_seg_prep is called to set things up. |
| 838 | */ |
| 839 | int |
| 840 | dump_seg_iter(int (*callback)(paddr_t, paddr_t)) |
| 841 | { |
| 842 | int error, i; |
| 843 | |
| 844 | #define CALLBACK(start,size) do { \ |
| 845 | error = callback(start,size); \ |
| 846 | if (error) \ |
| 847 | return error; \ |
| 848 | } while(0) |
| 849 | |
| 850 | for (i = 0; i < mem_cluster_cnt; ++i) { |
| 851 | #ifndef NO_SPARSE_DUMP |
| 852 | /* |
| 853 | * The bitmap is scanned within each memory segment, |
| 854 | * rather than over its entire domain, in case any |
| 855 | * pages outside of the memory proper have been mapped |
| 856 | * into kva; they might be devices that wouldn't |
| 857 | * appreciate being arbitrarily read, and including |
| 858 | * them could also break the assumption that a sparse |
| 859 | * dump will always be smaller than a full one. |
| 860 | */ |
| 861 | if (sparse_dump && sparse_dump_physmap) { |
| 862 | paddr_t p, start, end; |
| 863 | int lastset; |
| 864 | |
| 865 | start = mem_clusters[i].start; |
| 866 | end = start + mem_clusters[i].size; |
| 867 | start = rounddown(start, PAGE_SIZE); /* unnecessary? */ |
| 868 | lastset = 0; |
| 869 | for (p = start; p < end; p += PAGE_SIZE) { |
| 870 | int thisset = isset(sparse_dump_physmap, |
| 871 | p/PAGE_SIZE); |
| 872 | |
| 873 | if (!lastset && thisset) |
| 874 | start = p; |
| 875 | if (lastset && !thisset) |
| 876 | CALLBACK(start, p - start); |
| 877 | lastset = thisset; |
| 878 | } |
| 879 | if (lastset) |
| 880 | CALLBACK(start, p - start); |
| 881 | } else |
| 882 | #endif |
| 883 | CALLBACK(mem_clusters[i].start, mem_clusters[i].size); |
| 884 | } |
| 885 | return 0; |
| 886 | #undef CALLBACK |
| 887 | } |
| 888 | |
| 889 | /* |
| 890 | * Prepare for an impending core dump: decide what's being dumped and |
| 891 | * how much space it will take up. |
| 892 | */ |
| 893 | void |
| 894 | dump_seg_prep(void) |
| 895 | { |
| 896 | #ifndef NO_SPARSE_DUMP |
| 897 | if (sparse_dump && sparse_dump_physmap) |
| 898 | cpu_dump_prep_sparse(); |
| 899 | #endif |
| 900 | |
| 901 | dump_nmemsegs = 0; |
| 902 | dump_npages = 0; |
| 903 | dump_seg_iter(dump_seg_count_range); |
| 904 | |
| 905 | dump_header_size = ALIGN(sizeof(kcore_seg_t)) + |
| 906 | ALIGN(sizeof(cpu_kcore_hdr_t)) + |
| 907 | ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t)); |
| 908 | dump_header_size = roundup(dump_header_size, dbtob(1)); |
| 909 | |
| 910 | /* |
| 911 | * savecore(8) will read this to decide how many pages to |
| 912 | * copy, and cpu_dumpconf has already used the pessimistic |
| 913 | * value to set dumplo, so it's time to tell the truth. |
| 914 | */ |
| 915 | dumpsize = dump_npages; /* XXX could these just be one variable? */ |
| 916 | } |
| 917 | |
| 918 | int |
| 919 | dump_seg_count_range(paddr_t start, paddr_t size) |
| 920 | { |
| 921 | ++dump_nmemsegs; |
| 922 | dump_npages += size / PAGE_SIZE; |
| 923 | return 0; |
| 924 | } |
| 925 | |
| 926 | /* |
| 927 | * A sparse dump's header may be rather large, due to the number of |
| 928 | * "segments" emitted. These routines manage a simple output buffer, |
| 929 | * so that the header can be written to disk incrementally. |
| 930 | */ |
| 931 | void |
| 932 | dump_header_start(void) |
| 933 | { |
| 934 | dump_headerbuf_ptr = dump_headerbuf; |
| 935 | dump_header_blkno = dumplo; |
| 936 | } |
| 937 | |
| 938 | int |
| 939 | dump_header_flush(void) |
| 940 | { |
| 941 | const struct bdevsw *bdev; |
| 942 | size_t to_write; |
| 943 | int error; |
| 944 | |
| 945 | bdev = bdevsw_lookup(dumpdev); |
| 946 | to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1)); |
| 947 | error = bdev->d_dump(dumpdev, dump_header_blkno, |
| 948 | dump_headerbuf, to_write); |
| 949 | dump_header_blkno += btodb(to_write); |
| 950 | dump_headerbuf_ptr = dump_headerbuf; |
| 951 | return error; |
| 952 | } |
| 953 | |
| 954 | int |
| 955 | dump_header_addbytes(const void* vptr, size_t n) |
| 956 | { |
| 957 | const char* ptr = vptr; |
| 958 | int error; |
| 959 | |
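| | /* Fill and flush the staging buffer until the remainder fits. */ |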
| 960 | while (n > dump_headerbuf_avail) { |
| 961 | memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail); |
| 962 | ptr += dump_headerbuf_avail; |
| 963 | n -= dump_headerbuf_avail; |
| 964 | dump_headerbuf_ptr = dump_headerbuf_end; |
| 965 | error = dump_header_flush(); |
| 966 | if (error) |
| 967 | return error; |
| 968 | } |
| 969 | memcpy(dump_headerbuf_ptr, ptr, n); |
| 970 | dump_headerbuf_ptr += n; |
| 971 | |
| 972 | return 0; |
| 973 | } |
| 974 | |
| 975 | int |
| 976 | dump_header_addseg(paddr_t start, paddr_t size) |
| 977 | { |
| 978 | phys_ram_seg_t seg = { start, size }; |
| 979 | |
| 980 | return dump_header_addbytes(&seg, sizeof(seg)); |
| 981 | } |
| 982 | |
| 983 | int |
| 984 | dump_header_finish(void) |
| 985 | { |
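| | /* Zero-pad the tail of the staging buffer, then write the final block. */ |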
| 986 | memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail); |
| 987 | return dump_header_flush(); |
| 988 | } |
| 989 | |
| 990 | |
| 991 | /* |
| 992 | * These variables are needed by /sbin/savecore |
| 993 | */ |
| 994 | uint32_t dumpmag = 0x8fca0101; /* magic number */ |
| 995 | int dumpsize = 0; /* pages */ |
| 996 | long dumplo = 0; /* blocks */ |
| 997 | |
| 998 | /* |
| 999 | * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers |
| 1000 | * for a full (non-sparse) dump. |
| 1001 | */ |
| 1002 | int |
| 1003 | cpu_dumpsize(void) |
| 1004 | { |
| 1005 | int size; |
| 1006 | |
| 1007 | size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) + |
| 1008 | ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t)); |
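| | /* The MD headers of a full dump must fit in a single disk block. */ |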
| 1009 | if (roundup(size, dbtob(1)) != dbtob(1)) |
| 1010 | return (-1); |
| 1011 | |
| 1012 | return (1); |
| 1013 | } |
| 1014 | |
| 1015 | /* |
| 1016 | * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped |
| 1017 | * for a full (non-sparse) dump. |
| 1018 | */ |
| 1019 | u_long |
| 1020 | cpu_dump_mempagecnt(void) |
| 1021 | { |
| 1022 | u_long i, n; |
| 1023 | |
| 1024 | n = 0; |
| 1025 | for (i = 0; i < mem_cluster_cnt; i++) |
| 1026 | n += atop(mem_clusters[i].size); |
| 1027 | return (n); |
| 1028 | } |
| 1029 | |
| 1030 | /* |
| 1031 | * cpu_dump: dump the machine-dependent kernel core dump headers. |
| 1032 | */ |
| 1033 | int |
| 1034 | cpu_dump(void) |
| 1035 | { |
| 1036 | kcore_seg_t seg; |
| 1037 | cpu_kcore_hdr_t cpuhdr; |
| 1038 | const struct bdevsw *bdev; |
| 1039 | |
| 1040 | bdev = bdevsw_lookup(dumpdev); |
| 1041 | if (bdev == NULL) |
| 1042 | return (ENXIO); |
| 1043 | |
| 1044 | /* |
| 1045 | * Generate a segment header. |
| 1046 | */ |
| 1047 | CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU); |
| 1048 | seg.c_size = dump_header_size - ALIGN(sizeof(seg)); |
| 1049 | (void)dump_header_addbytes(&seg, ALIGN(sizeof(seg))); |
| 1050 | |
| 1051 | /* |
| 1052 | * Add the machine-dependent header info. |
| 1053 | */ |
| 1054 | cpuhdr.ptdpaddr = PDPpaddr; |
| 1055 | cpuhdr.nmemsegs = dump_nmemsegs; |
| 1056 | (void)dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr))); |
| 1057 | |
| 1058 | /* |
| 1059 | * Write out the memory segment descriptors. |
| 1060 | */ |
| 1061 | return dump_seg_iter(dump_header_addseg); |
| 1062 | } |
| 1063 | |
| 1064 | /* |
| 1065 | * Doadump comes here after turning off memory management and |
| 1066 | * getting on the dump stack, either when called above, or by |
| 1067 | * the auto-restart code. |
| 1068 | */ |
| 1069 | #define BYTES_PER_DUMP PAGE_SIZE /* must be a multiple of pagesize XXX small */ |
| 1070 | static vaddr_t dumpspace; |
| 1071 | |
| 1072 | vaddr_t |
| 1073 | reserve_dumppages(vaddr_t p) |
| 1074 | { |
| 1075 | |
| 1076 | dumpspace = p; |
| 1077 | return (p + BYTES_PER_DUMP); |
| 1078 | } |
| 1079 | |
| 1080 | int |
| 1081 | dumpsys_seg(paddr_t maddr, paddr_t bytes) |
| 1082 | { |
| 1083 | u_long i, m, n; |
| 1084 | daddr_t blkno; |
| 1085 | const struct bdevsw *bdev; |
| 1086 | int (*dump)(dev_t, daddr_t, void *, size_t); |
| 1087 | int error; |
| 1088 | |
| 1089 | if (dumpdev == NODEV) |
| 1090 | return ENODEV; |
| 1091 | bdev = bdevsw_lookup(dumpdev); |
| 1092 | if (bdev == NULL || bdev->d_psize == NULL) |
| 1093 | return ENODEV; |
| 1094 | |
| 1095 | dump = bdev->d_dump; |
| 1096 | |
| 1097 | blkno = dump_header_blkno; |
| 1098 | for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) { |
| 1099 | /* Print out how many MBs we have left to go. */ |
| 1100 | if ((dump_totalbytesleft % (1024*1024)) == 0) |
| 1101 | printf_nolog("%lu " , (unsigned long) |
| 1102 | (dump_totalbytesleft / (1024 * 1024))); |
| 1103 | |
| 1104 | /* Limit size for next transfer. */ |
| 1105 | n = bytes - i; |
| 1106 | if (n > BYTES_PER_DUMP) |
| 1107 | n = BYTES_PER_DUMP; |
| 1108 | |
| 1109 | for (m = 0; m < n; m += NBPG) |
| 1110 | pmap_kenter_pa(dumpspace + m, maddr + m, |
| 1111 | VM_PROT_READ, 0); |
| 1112 | pmap_update(pmap_kernel()); |
| 1113 | |
| 1114 | error = (*dump)(dumpdev, blkno, (void *)dumpspace, n); |
| 1115 | pmap_kremove_local(dumpspace, n); |
| 1116 | if (error) |
| 1117 | return error; |
| 1118 | maddr += n; |
| 1119 | blkno += btodb(n); /* XXX? */ |
| 1120 | |
| 1121 | #if 0 /* XXX this doesn't work. grr. */ |
| 1122 | /* operator aborting dump? */ |
| 1123 | if (sget() != NULL) |
| 1124 | return EINTR; |
| 1125 | #endif |
| 1126 | } |
| 1127 | dump_header_blkno = blkno; |
| 1128 | |
| 1129 | return 0; |
| 1130 | } |
| 1131 | |
| 1132 | void |
| 1133 | dodumpsys(void) |
| 1134 | { |
| 1135 | const struct bdevsw *bdev; |
| 1136 | int dumpend, psize; |
| 1137 | int error; |
| 1138 | |
| 1139 | if (dumpdev == NODEV) |
| 1140 | return; |
| 1141 | |
| 1142 | bdev = bdevsw_lookup(dumpdev); |
| 1143 | if (bdev == NULL || bdev->d_psize == NULL) |
| 1144 | return; |
| 1145 | /* |
| 1146 | * For dumps during autoconfiguration, |
| 1147 | * if dump device has already configured... |
| 1148 | */ |
| 1149 | if (dumpsize == 0) |
| 1150 | cpu_dumpconf(); |
| 1151 | |
| 1152 | printf("\ndumping to dev %llu,%llu (offset=%ld, size=%d):" , |
| 1153 | (unsigned long long)major(dumpdev), |
| 1154 | (unsigned long long)minor(dumpdev), dumplo, dumpsize); |
| 1155 | |
| 1156 | if (dumplo <= 0 || dumpsize <= 0) { |
| 1157 | printf(" not possible\n" ); |
| 1158 | return; |
| 1159 | } |
| 1160 | |
| 1161 | psize = bdev_size(dumpdev); |
| 1162 | printf("\ndump " ); |
| 1163 | if (psize == -1) { |
| 1164 | printf("area unavailable\n" ); |
| 1165 | return; |
| 1166 | } |
| 1167 | |
| 1168 | #if 0 /* XXX this doesn't work. grr. */ |
| 1169 | /* toss any characters present prior to dump */ |
| 1170 | while (sget() != NULL); /*syscons and pccons differ */ |
| 1171 | #endif |
| 1172 | |
| 1173 | dump_seg_prep(); |
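| | /* End block of the dump: dumplo plus header blocks plus the blocks covering all dumped pages. */ |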
| 1174 | dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages); |
| 1175 | if (dumpend > psize) { |
| 1176 | printf("failed: insufficient space (%d < %d)\n" , |
| 1177 | psize, dumpend); |
| 1178 | goto failed; |
| 1179 | } |
| 1180 | |
| 1181 | dump_header_start(); |
| 1182 | if ((error = cpu_dump()) != 0) |
| 1183 | goto err; |
| 1184 | if ((error = dump_header_finish()) != 0) |
| 1185 | goto err; |
| 1186 | |
| 1187 | if (dump_header_blkno != dumplo + btodb(dump_header_size)) { |
| 1188 | printf("BAD header size (%ld [written] != %ld [expected])\n" , |
| 1189 | (long)(dump_header_blkno - dumplo), |
| 1190 | (long)btodb(dump_header_size)); |
| 1191 | goto failed; |
| 1192 | } |
| 1193 | |
| 1194 | dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP); |
| 1195 | error = dump_seg_iter(dumpsys_seg); |
| 1196 | |
| 1197 | if (error == 0 && dump_header_blkno != dumpend) { |
| 1198 | printf("BAD dump size (%ld [written] != %ld [expected])\n" , |
| 1199 | (long)(dumpend - dumplo), |
| 1200 | (long)(dump_header_blkno - dumplo)); |
| 1201 | goto failed; |
| 1202 | } |
| 1203 | |
| 1204 | err: |
| 1205 | switch (error) { |
| 1206 | |
| 1207 | case ENXIO: |
| 1208 | printf("device bad\n"); |
| 1209 | break; |
| 1210 | |
| 1211 | case EFAULT: |
| 1212 | printf("device not ready\n"); |
| 1213 | break; |
| 1214 | |
| 1215 | case EINVAL: |
| 1216 | printf("area improper\n"); |
| 1217 | break; |
| 1218 | |
| 1219 | case EIO: |
| 1220 | printf("i/o error\n"); |
| 1221 | break; |
| 1222 | |
| 1223 | case EINTR: |
| 1224 | printf("aborted from console\n"); |
| 1225 | break; |
| 1226 | |
| 1227 | case 0: |
| 1228 | printf("succeeded\n"); |
| 1229 | break; |
| 1230 | |
| 1231 | default: |
| 1232 | printf("error %d\n", error); |
| 1233 | break; |
| 1234 | } |
| 1235 | failed: |
| 1236 | printf("\n\n" ); |
| 1237 | delay(5000000); /* 5 seconds */ |
| 1238 | } |
| 1239 | |
| 1240 | /* |
| 1241 | * This is called by main to set dumplo and dumpsize. |
| 1242 | * Dumps always skip the first PAGE_SIZE of disk space |
| 1243 | * in case there might be a disk label stored there. |
| 1244 | * If there is extra space, put dump at the end to |
| 1245 | * reduce the chance that swapping trashes it. |
| 1246 | * |
| 1247 | * Sparse dumps can't be placed as close to the end as possible, because |
| 1248 | * savecore(8) has to know where to start reading in the dump device |
| 1249 | * before it has access to any of the crashed system's state. |
| 1250 | * |
| 1251 | * Note also that a sparse dump will never be larger than a full one: |
| 1252 | * in order to add a phys_ram_seg_t to the header, at least one page |
| 1253 | * must be removed. |
| 1254 | */ |
| 1255 | void |
| 1256 | cpu_dumpconf(void) |
| 1257 | { |
| 1258 | int nblks, dumpblks; /* size of dump area */ |
| 1259 | |
| 1260 | if (dumpdev == NODEV) |
| 1261 | goto bad; |
| 1262 | nblks = bdev_size(dumpdev); |
| 1263 | if (nblks <= ctod(1)) |
| 1264 | goto bad; |
| 1265 | |
| 1266 | dumpblks = cpu_dumpsize(); |
| 1267 | if (dumpblks < 0) |
| 1268 | goto bad; |
| 1269 | |
| 1270 | /* dumpsize is in page units, and doesn't include headers. */ |
| 1271 | dumpsize = cpu_dump_mempagecnt(); |
| 1272 | |
| 1273 | dumpblks += ctod(dumpsize); |
| 1274 | |
| 1275 | /* If dump won't fit (incl. room for possible label), punt. */ |
| 1276 | if (dumpblks > (nblks - ctod(1))) { |
| 1277 | #ifndef NO_SPARSE_DUMP |
| 1278 | /* A sparse dump might (and hopefully will) fit. */ |
| 1279 | dumplo = ctod(1); |
| 1280 | #else |
| 1281 | /* But if we're not configured for that, punt. */ |
| 1282 | goto bad; |
| 1283 | #endif |
| 1284 | } else { |
| 1285 | /* Put dump at end of partition */ |
| 1286 | dumplo = nblks - dumpblks; |
| 1287 | } |
| 1288 | |
| 1289 | |
| 1290 | /* Now that we've decided this will work, init ancillary stuff. */ |
| 1291 | dump_misc_init(); |
| 1292 | return; |
| 1293 | |
| 1294 | bad: |
| 1295 | dumpsize = 0; |
| 1296 | } |
| 1297 | |
| 1298 | /* |
| 1299 | * Clear registers on exec |
| 1300 | */ |
| 1301 | void |
| 1302 | setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack) |
| 1303 | { |
| 1304 | struct pcb *pcb = lwp_getpcb(l); |
| 1305 | struct trapframe *tf; |
| 1306 | |
| 1307 | #ifdef USER_LDT |
| 1308 | pmap_ldt_cleanup(l); |
| 1309 | #endif |
| 1310 | |
| 1311 | fpu_save_area_clear(l, pack->ep_osversion >= 699002600 |
| 1312 | ? __NetBSD_NPXCW__ : __NetBSD_COMPAT_NPXCW__); |
| 1313 | pcb->pcb_flags = 0; |
| 1314 | |
| 1315 | l->l_proc->p_flag &= ~PK_32; |
| 1316 | |
| 1317 | tf = l->l_md.md_regs; |
| 1318 | tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL); |
| 1319 | tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL); |
| 1320 | cpu_fsgs_zero(l); |
| 1321 | tf->tf_rdi = 0; |
| 1322 | tf->tf_rsi = 0; |
| 1323 | tf->tf_rbp = 0; |
| 1324 | tf->tf_rbx = l->l_proc->p_psstrp; |
| 1325 | tf->tf_rdx = 0; |
| 1326 | tf->tf_rcx = 0; |
| 1327 | tf->tf_rax = 0; |
| 1328 | tf->tf_rip = pack->ep_entry; |
| 1329 | tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL); |
| 1330 | tf->tf_rflags = PSL_USERSET; |
| 1331 | tf->tf_rsp = stack; |
| 1332 | tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL); |
| 1333 | } |
| 1334 | |
| 1335 | /* |
| 1336 | * Initialize segments and descriptor tables |
| 1337 | */ |
| 1338 | |
| 1339 | #ifdef XEN |
| 1340 | struct trap_info *xen_idt; |
| 1341 | int xen_idt_idx; |
| 1342 | #endif |
| 1343 | char *ldtstore; |
| 1344 | char *gdtstore; |
| 1345 | |
| 1346 | void |
| 1347 | setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl, int sel) |
| 1348 | { |
| 1349 | |
| 1350 | kpreempt_disable(); |
| 1351 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE); |
| 1352 | |
| 1353 | gd->gd_looffset = (uint64_t)func & 0xffff; |
| 1354 | gd->gd_selector = sel; |
| 1355 | gd->gd_ist = ist; |
| 1356 | gd->gd_type = type; |
| 1357 | gd->gd_dpl = dpl; |
| 1358 | gd->gd_p = 1; |
| 1359 | gd->gd_hioffset = (uint64_t)func >> 16; |
| 1360 | gd->gd_zero = 0; |
| 1361 | gd->gd_xx1 = 0; |
| 1362 | gd->gd_xx2 = 0; |
| 1363 | gd->gd_xx3 = 0; |
| 1364 | |
| 1365 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ); |
| 1366 | kpreempt_enable(); |
| 1367 | } |
| 1368 | |
| 1369 | void |
| 1370 | unsetgate(struct gate_descriptor *gd) |
| 1371 | { |
| 1372 | |
| 1373 | kpreempt_disable(); |
| 1374 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE); |
| 1375 | |
| 1376 | memset(gd, 0, sizeof (*gd)); |
| 1377 | |
| 1378 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ); |
| 1379 | kpreempt_enable(); |
| 1380 | } |
| 1381 | |
| 1382 | void |
| 1383 | setregion(struct region_descriptor *rd, void *base, uint16_t limit) |
| 1384 | { |
| 1385 | rd->rd_limit = limit; |
| 1386 | rd->rd_base = (uint64_t)base; |
| 1387 | } |
| 1388 | |
| 1389 | /* |
| 1390 | * Note that the base and limit fields are ignored in long mode. |
| 1391 | */ |
| 1392 | void |
| 1393 | set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit, |
| 1394 | int type, int dpl, int gran, int def32, int is64) |
| 1395 | { |
| 1396 | sd->sd_lolimit = (unsigned)limit; |
| 1397 | sd->sd_lobase = (unsigned long)base; |
| 1398 | sd->sd_type = type; |
| 1399 | sd->sd_dpl = dpl; |
| 1400 | sd->sd_p = 1; |
| 1401 | sd->sd_hilimit = (unsigned)limit >> 16; |
| 1402 | sd->sd_avl = 0; |
| 1403 | sd->sd_long = is64; |
| 1404 | sd->sd_def32 = def32; |
| 1405 | sd->sd_gran = gran; |
| 1406 | sd->sd_hibase = (unsigned long)base >> 24; |
| 1407 | } |
| 1408 | |
| 1409 | void |
| 1410 | set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit, |
| 1411 | int type, int dpl, int gran) |
| 1412 | { |
| 1413 | memset(sd, 0, sizeof *sd); |
| 1414 | sd->sd_lolimit = (unsigned)limit; |
| 1415 | sd->sd_lobase = (uint64_t)base; |
| 1416 | sd->sd_type = type; |
| 1417 | sd->sd_dpl = dpl; |
| 1418 | sd->sd_p = 1; |
| 1419 | sd->sd_hilimit = (unsigned)limit >> 16; |
| 1420 | sd->sd_gran = gran; |
| 1421 | sd->sd_hibase = (uint64_t)base >> 24; |
| 1422 | } |
| 1423 | |
| 1424 | void |
| 1425 | cpu_init_idt(void) |
| 1426 | { |
| 1427 | #ifndef XEN |
| 1428 | struct region_descriptor region; |
| 1429 | |
| 1430 | setregion(®ion, idt, NIDT * sizeof(idt[0]) - 1); |
| 1431 | lidt(®ion); |
| 1432 | #else |
| 1433 | if (HYPERVISOR_set_trap_table(xen_idt)) |
| 1434 | panic("HYPERVISOR_set_trap_table() failed" ); |
| 1435 | #endif |
| 1436 | } |
| 1437 | |
| 1438 | #define IDTVEC(name) __CONCAT(X, name) |
| 1439 | typedef void (vector)(void); |
| 1440 | extern vector IDTVEC(syscall); |
| 1441 | extern vector IDTVEC(syscall32); |
| 1442 | extern vector IDTVEC(osyscall); |
| 1443 | extern vector IDTVEC(oosyscall); |
| 1444 | extern vector *IDTVEC(exceptions)[]; |
| 1445 | |
| 1446 | static void |
| 1447 | init_x86_64_msgbuf(void) |
| 1448 | { |
| 1449 | /* Message buffer is located at end of core. */ |
| 1450 | struct vm_physseg *vps; |
| 1451 | psize_t sz = round_page(MSGBUFSIZE); |
| 1452 | psize_t reqsz = sz; |
| 1453 | int x; |
| 1454 | |
| 1455 | search_again: |
| 1456 | vps = NULL; |
| 1457 | |
| 1458 | for (x = 0; x < vm_nphysseg; x++) { |
| 1459 | vps = VM_PHYSMEM_PTR(x); |
| 1460 | if (ctob(vps->avail_end) == avail_end) |
| 1461 | break; |
| 1462 | } |
| 1463 | if (x == vm_nphysseg) |
| 1464 | panic("init_x86_64: can't find end of memory" ); |
| 1465 | |
| 1466 | /* Shrink so it'll fit in the last segment. */ |
| 1467 | if ((vps->avail_end - vps->avail_start) < atop(sz)) |
| 1468 | sz = ctob(vps->avail_end - vps->avail_start); |
| 1469 | |
| 1470 | vps->avail_end -= atop(sz); |
| 1471 | vps->end -= atop(sz); |
| 1472 | msgbuf_p_seg[msgbuf_p_cnt].sz = sz; |
| 1473 | msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(vps->avail_end); |
| 1474 | |
| 1475 | /* Remove the last segment if it now has no pages. */ |
| 1476 | if (vps->start == vps->end) { |
| 1477 | for (vm_nphysseg--; x < vm_nphysseg; x++) |
| 1478 | VM_PHYSMEM_PTR_SWAP(x, x + 1); |
| 1479 | } |
| 1480 | |
| 1481 | /* Now find where the new avail_end is. */ |
| 1482 | for (avail_end = 0, x = 0; x < vm_nphysseg; x++) |
| 1483 | if (VM_PHYSMEM_PTR(x)->avail_end > avail_end) |
| 1484 | avail_end = VM_PHYSMEM_PTR(x)->avail_end; |
| 1485 | avail_end = ctob(avail_end); |
| 1486 | |
| 1487 | if (sz == reqsz) |
| 1488 | return; |
| 1489 | |
| 1490 | reqsz -= sz; |
| 1491 | if (msgbuf_p_cnt == VM_PHYSSEG_MAX) { |
| 1492 | /* No more segments available, bail out. */ |
| 1493 | printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n" , |
| 1494 | (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz)); |
| 1495 | return; |
| 1496 | } |
| 1497 | |
| 1498 | sz = reqsz; |
| 1499 | goto search_again; |
| 1500 | } |
| 1501 | |
| 1502 | static void |
| 1503 | init_x86_64_ksyms(void) |
| 1504 | { |
| 1505 | #if NKSYMS || defined(DDB) || defined(MODULAR) |
| 1506 | extern int end; |
| 1507 | extern int *esym; |
| 1508 | #ifndef XEN |
| 1509 | struct btinfo_symtab *symtab; |
| 1510 | vaddr_t tssym, tesym; |
| 1511 | #endif |
| 1512 | |
| 1513 | #ifdef DDB |
| 1514 | db_machine_init(); |
| 1515 | #endif |
| 1516 | |
| 1517 | #ifndef XEN |
| 1518 | symtab = lookup_bootinfo(BTINFO_SYMTAB); |
| 1519 | if (symtab) { |
| 1520 | tssym = (vaddr_t)symtab->ssym + KERNBASE; |
| 1521 | tesym = (vaddr_t)symtab->esym + KERNBASE; |
| 1522 | ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym); |
| 1523 | } else |
| 1524 | ksyms_addsyms_elf(*(long *)(void *)&end, |
| 1525 | ((long *)(void *)&end) + 1, esym); |
| 1526 | #else /* XEN */ |
| 1527 | esym = xen_start_info.mod_start ? |
| 1528 | (void *)xen_start_info.mod_start : |
| 1529 | (void *)xen_start_info.mfn_list; |
| 1530 | ksyms_addsyms_elf(*(int *)(void *)&end, |
| 1531 | ((int *)(void *)&end) + 1, esym); |
| 1532 | #endif /* XEN */ |
| 1533 | #endif |
| 1534 | } |
| 1535 | |
| 1536 | void |
| 1537 | init_x86_64(paddr_t first_avail) |
| 1538 | { |
| 1539 | extern void consinit(void); |
| 1540 | struct region_descriptor region; |
| 1541 | struct mem_segment_descriptor *ldt_segp; |
| 1542 | int x; |
| 1543 | #ifndef XEN |
| 1544 | int ist; |
| 1545 | #endif |
| 1546 | |
| 1547 | KASSERT(first_avail % PAGE_SIZE == 0); |
| 1548 | |
| 1549 | #ifdef XEN |
| 1550 | KASSERT(HYPERVISOR_shared_info != NULL); |
| 1551 | cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0]; |
| 1552 | |
| 1553 | __PRINTK(("init_x86_64(0x%lx)\n" , first_avail)); |
| 1554 | #endif /* XEN */ |
| 1555 | |
| 1556 | cpu_probe(&cpu_info_primary); |
| 1557 | cpu_init_msrs(&cpu_info_primary, true); |
| 1558 | |
| 1559 | use_pae = 1; /* PAE always enabled in long mode */ |
| 1560 | |
| 1561 | #ifdef XEN |
| 1562 | struct pcb *pcb = lwp_getpcb(&lwp0); |
| 1563 | mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM); |
| 1564 | pcb->pcb_cr3 = xen_start_info.pt_base - KERNBASE; |
| 1565 | __PRINTK(("pcb_cr3 0x%lx\n" , xen_start_info.pt_base - KERNBASE)); |
| 1566 | #endif |
| 1567 | |
| 1568 | #if NISA > 0 || NPCI > 0 |
| 1569 | x86_bus_space_init(); |
| 1570 | #endif |
| 1571 | |
| 1572 | consinit(); /* XXX SHOULD NOT BE DONE HERE */ |
| 1573 | |
| 1574 | /* |
| 1575 | * Initialize PAGE_SIZE-dependent variables. |
| 1576 | */ |
| 1577 | uvm_setpagesize(); |
| 1578 | |
| 1579 | uvmexp.ncolors = 2; |
| 1580 | |
| 1581 | #ifndef XEN |
| 1582 | /* |
| 1583 | * Low memory reservations: |
| 1584 | * Page 0: BIOS data |
| 1585 | * Page 1: BIOS callback (not used yet, for symmetry with i386) |
| 1586 | * Page 2: MP bootstrap code (MP_TRAMPOLINE) |
| 1587 | * Page 3: ACPI wakeup code (ACPI_WAKEUP_ADDR) |
| 1588 | * Page 4: Temporary page table for 0MB-4MB |
| 1589 | * Page 5: Temporary page directory |
| 1590 | * Page 6: Temporary page map level 3 |
| 1591 | * Page 7: Temporary page map level 4 |
| 1592 | */ |
| 1593 | avail_start = 8 * PAGE_SIZE; |
| 1594 | |
| 1595 | /* Initialize the memory clusters (needed in pmap_bootstrap). */ |
| 1596 | init_x86_clusters(); |
| 1597 | #else /* XEN */ |
| 1598 | /* Parse Xen command line (replace bootinfo) */ |
| 1599 | xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL); |
| 1600 | |
| 1601 | /* Determine physical address space */ |
| 1602 | avail_start = first_avail; |
| 1603 | avail_end = ctob(xen_start_info.nr_pages); |
| 1604 | pmap_pa_start = (KERNTEXTOFF - KERNBASE); |
| 1605 | pmap_pa_end = avail_end; |
| 1606 | __PRINTK(("pmap_pa_start 0x%lx avail_start 0x%lx avail_end 0x%lx\n" , |
| 1607 | pmap_pa_start, avail_start, avail_end)); |
| 1608 | #endif /* !XEN */ |
| 1609 | |
| 1610 | /* End of the virtual space we have created so far. */ |
| 1611 | kern_end = KERNBASE + first_avail; |
| 1612 | |
| 1613 | /* |
| 1614 | * Call pmap initialization to make new kernel address space. |
| 1615 | * We must do this before loading pages into the VM system. |
| 1616 | */ |
| 1617 | pmap_bootstrap(VM_MIN_KERNEL_ADDRESS); |
| 1618 | |
| 1619 | #ifndef XEN |
| 1620 | /* Internalize the physical pages into the VM system. */ |
| 1621 | init_x86_vm(first_avail); |
| 1622 | #else /* XEN */ |
| 1623 | physmem = xen_start_info.nr_pages; |
| 1624 | |
| 1625 | uvm_page_physload(atop(avail_start), |
| 1626 | atop(avail_end), atop(avail_start), |
| 1627 | atop(avail_end), VM_FREELIST_DEFAULT); |
| 1628 | #endif /* !XEN */ |
| 1629 | |
| 1630 | init_x86_64_msgbuf(); |
| 1631 | |
| 1632 | pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024); |
| 1633 | |
| 1634 | kpreempt_disable(); |
| 1635 | |
| 1636 | pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); |
| 1637 | pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); |
| 1638 | pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); |
| 1639 | pmap_update(pmap_kernel()); |
| 1640 | memset((void *)idt_vaddr, 0, PAGE_SIZE); |
| 1641 | memset((void *)gdt_vaddr, 0, PAGE_SIZE); |
| 1642 | memset((void *)ldt_vaddr, 0, PAGE_SIZE); |
| 1643 | |
| 1644 | #ifndef XEN |
| 1645 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ); |
| 1646 | #endif |
| 1647 | |
| 1648 | pmap_update(pmap_kernel()); |
| 1649 | |
| 1650 | #ifndef XEN |
| 1651 | idt = (struct gate_descriptor *)idt_vaddr; |
| 1652 | #else |
| 1653 | xen_idt = (struct trap_info *)idt_vaddr; |
| 1654 | xen_idt_idx = 0; |
| 1655 | #endif |
| 1656 | gdtstore = (char *)gdt_vaddr; |
| 1657 | ldtstore = (char *)ldt_vaddr; |
| 1658 | |
| 1659 | /* |
| 1660 | * Make GDT gates and memory segments. |
| 1661 | */ |
| 1662 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0, |
| 1663 | 0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1); |
| 1664 | |
| 1665 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0, |
| 1666 | 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1); |
| 1667 | |
| 1668 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0, |
| 1669 | x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1); |
| 1670 | |
| 1671 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0, |
| 1672 | x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1); |
| 1673 | |
| 1674 | #ifndef XEN |
| 1675 | set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore, |
| 1676 | LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0); |
| 1677 | #endif |
| 1678 | |
| 1679 | /* |
| 1680 | * Make LDT gates and memory segments. |
| 1681 | */ |
| 1682 | setgate((struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL), |
| 1683 | &IDTVEC(oosyscall), 0, SDT_SYS386CGT, SEL_UPL, |
| 1684 | GSEL(GCODE_SEL, SEL_KPL)); |
| 1685 | *(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) = |
| 1686 | *GDT_ADDR_MEM(gdtstore, GUCODE_SEL); |
| 1687 | *(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) = |
| 1688 | *GDT_ADDR_MEM(gdtstore, GUDATA_SEL); |
| 1689 | |
| 1690 | /* |
| 1691 | * 32 bit GDT entries. |
| 1692 | */ |
| 1693 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0, |
| 1694 | x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0); |
| 1695 | |
| 1696 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0, |
| 1697 | x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); |
| 1698 | |
| 1699 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0, |
| 1700 | x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); |
| 1701 | |
| 1702 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0, |
| 1703 | x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); |
| 1704 | |
| 1705 | /* |
| 1706 | * 32 bit LDT entries. |
| 1707 | */ |
| 1708 | ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL); |
| 1709 | set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, |
| 1710 | SDT_MEMERA, SEL_UPL, 1, 1, 0); |
| 1711 | ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL); |
| 1712 | set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, |
| 1713 | SDT_MEMRWA, SEL_UPL, 1, 1, 0); |
| 1714 | |
| 1715 | /* |
| 1716 | * Other LDT entries. |
| 1717 | */ |
| 1718 | memcpy((struct gate_descriptor *)(ldtstore + LSOL26CALLS_SEL), |
| 1719 | (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL), |
| 1720 | sizeof (struct gate_descriptor)); |
| 1721 | memcpy((struct gate_descriptor *)(ldtstore + LBSDICALLS_SEL), |
| 1722 | (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL), |
| 1723 | sizeof (struct gate_descriptor)); |
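| | /* |
| | * The Solaris 2.6 and BSDI syscall gates are byte-for-byte copies |
| | * of the SysV LSYS5CALLS_SEL gate installed above. |
| | */ |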
| 1724 | |
| 1725 | /* CPU-specific IDT exceptions. */ |
| 1726 | for (x = 0; x < NCPUIDT; x++) { |
| 1727 | #ifndef XEN |
| 1728 | idt_vec_reserve(x); |
| 1729 | switch (x) { |
| 1730 | case 2: /* NMI */ |
| 1731 | ist = 3; |
| 1732 | break; |
| 1733 | case 8: /* double fault */ |
| 1734 | ist = 2; |
| 1735 | break; |
| 1736 | default: |
| 1737 | ist = 0; |
| 1738 | break; |
| 1739 | } |
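| | /* |
| | * A non-zero ist selects one of the TSS interrupt stacks, so NMI |
| | * and double faults always run on a known-good stack. Vectors 3 |
| | * (#BP) and 4 (#OF) get DPL 3 below so that int3/into from user |
| | * mode can reach their gates. |
| | */ |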
| 1740 | setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT, |
| 1741 | (x == 3 || x == 4) ? SEL_UPL : SEL_KPL, |
| 1742 | GSEL(GCODE_SEL, SEL_KPL)); |
| 1743 | #else /* XEN */ |
| 1744 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE); |
| 1745 | xen_idt[xen_idt_idx].vector = x; |
| 1746 | |
| 1747 | switch (x) { |
| 1748 | case 2: /* NMI */ |
| 1749 | case 18: /* MCA */ |
| 1750 | TI_SET_IF(&(xen_idt[xen_idt_idx]), 2); |
| 1751 | break; |
| 1752 | case 3: |
| 1753 | case 4: |
| 1754 | xen_idt[xen_idt_idx].flags = SEL_UPL; |
| 1755 | break; |
| 1756 | default: |
| 1757 | xen_idt[xen_idt_idx].flags = SEL_KPL; |
| 1758 | break; |
| 1759 | } |
| 1760 | |
| 1761 | xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); |
| 1762 | xen_idt[xen_idt_idx].address = |
| 1763 | (unsigned long)IDTVEC(exceptions)[x]; |
| 1764 | xen_idt_idx++; |
| 1765 | #endif /* XEN */ |
| 1766 | } |
| 1767 | |
| 1768 | /* new-style interrupt gate for syscalls */ |
| 1769 | #ifndef XEN |
| 1770 | idt_vec_reserve(128); |
| 1771 | setgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL, |
| 1772 | GSEL(GCODE_SEL, SEL_KPL)); |
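| | /* |
| | * Vector 128 is 0x80: the int $0x80 entry used by old-style and |
| | * 32-bit compat syscalls; native 64-bit processes enter the kernel |
| | * via the SYSCALL instruction instead. |
| | */ |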
| 1773 | #else |
| 1774 | xen_idt[xen_idt_idx].vector = 128; |
| 1775 | xen_idt[xen_idt_idx].flags = SEL_KPL; |
| 1776 | xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); |
| 1777 | xen_idt[xen_idt_idx].address = (unsigned long) &IDTVEC(osyscall); |
| 1778 | xen_idt_idx++; |
| 1779 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ); |
| 1780 | #endif /* XEN */ |
| 1781 | kpreempt_enable(); |
| 1782 | |
| 1783 | setregion(&region, gdtstore, DYNSEL_START - 1); |
| 1784 | lgdt(&region); |
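| | /* |
| | * The GDTR limit is DYNSEL_START - 1, i.e. for now only the |
| | * statically initialized selectors are exposed; dynamically |
| | * allocated ones start at DYNSEL_START. |
| | */ |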
| 1785 | |
| 1786 | #ifdef XEN |
| 1787 | /* Init Xen callbacks and syscall handlers */ |
| 1788 | if (HYPERVISOR_set_callbacks( |
| 1789 | (unsigned long) hypervisor_callback, |
| 1790 | (unsigned long) failsafe_callback, |
| 1791 | (unsigned long) Xsyscall)) |
| 1792 | panic("HYPERVISOR_set_callbacks() failed" ); |
| 1793 | #endif /* XEN */ |
| 1794 | cpu_init_idt(); |
| 1795 | |
| 1796 | init_x86_64_ksyms(); |
| 1797 | |
| 1798 | #ifndef XEN |
| 1799 | intr_default_setup(); |
| 1800 | #else |
| 1801 | events_default_setup(); |
| 1802 | #endif |
| 1803 | |
| 1804 | splraise(IPL_HIGH); |
| 1805 | x86_enable_intr(); |
| 1806 | |
| 1807 | #ifdef DDB |
| 1808 | if (boothowto & RB_KDB) |
| 1809 | Debugger(); |
| 1810 | #endif |
| 1811 | #ifdef KGDB |
| 1812 | kgdb_port_init(); |
| 1813 | if (boothowto & RB_KDB) { |
| 1814 | kgdb_debug_init = 1; |
| 1815 | kgdb_connect(1); |
| 1816 | } |
| 1817 | #endif |
| 1818 | } |
| 1819 | |
| 1820 | void |
| 1821 | cpu_reset(void) |
| 1822 | { |
| 1823 | x86_disable_intr(); |
| 1824 | |
| 1825 | #ifdef XEN |
| 1826 | HYPERVISOR_reboot(); |
| 1827 | #else |
| 1828 | |
| 1829 | x86_reset(); |
| 1830 | |
| 1831 | /* |
| 1832 | * Try to cause a triple fault and watchdog reset by making the IDT |
| 1833 | * invalid and causing a fault. |
| 1834 | */ |
| 1835 | kpreempt_disable(); |
| 1836 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE); |
| 1837 | memset((void *)idt, 0, NIDT * sizeof(idt[0])); |
| 1838 | kpreempt_enable(); |
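| | /* |
| | * With the IDT zeroed, the #BP raised below cannot be delivered; |
| | * the CPU escalates to a triple fault, which forces a reset. |
| | */ |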
| 1839 | breakpoint(); |
| 1840 | |
| 1841 | #if 0 |
| 1842 | /* |
| 1843 | * Try to cause a triple fault and watchdog reset by unmapping the |
| 1844 | * entire address space and doing a TLB flush. |
| 1845 | */ |
| 1846 | memset((void *)PTD, 0, PAGE_SIZE); |
| 1847 | tlbflush(); |
| 1848 | #endif |
| 1849 | #endif /* XEN */ |
| 1850 | |
| 1851 | for (;;); /* spin until the reset takes effect */ |
| 1852 | } |
| 1853 | |
| 1854 | void |
| 1855 | cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags) |
| 1856 | { |
| 1857 | const struct trapframe *tf = l->l_md.md_regs; |
| 1858 | __greg_t ras_rip; |
| 1859 | |
| 1860 | /* Copy general registers member by member */ |
| 1861 | #define copy_from_tf(reg, REG, idx) mcp->__gregs[_REG_##REG] = tf->tf_##reg; |
| 1862 | _FRAME_GREG(copy_from_tf) |
| 1863 | #undef copy_from_tf |
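| | /* |
| | * _FRAME_GREG() is an X-macro: it expands its argument once per |
| | * saved register, so the invocation above becomes, e.g., |
| | *	mcp->__gregs[_REG_RDI] = tf->tf_rdi; |
| | * for every general register in the trapframe. |
| | */ |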
| 1864 | |
| 1865 | if ((ras_rip = (__greg_t)ras_lookup(l->l_proc, |
| 1866 | (void *) mcp->__gregs[_REG_RIP])) != -1) |
| 1867 | mcp->__gregs[_REG_RIP] = ras_rip; |
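| | /* |
| | * If the lwp was interrupted inside a registered restartable |
| | * atomic sequence, ras_lookup() returns the address to restart |
| | * from (otherwise -1), so the sequence appears atomic to userland. |
| | */ |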
| 1868 | |
| 1869 | *flags |= _UC_CPU; |
| 1870 | |
| 1871 | mcp->_mc_tlsbase = (uintptr_t)l->l_private; |
| 1872 | *flags |= _UC_TLSBASE; |
| 1873 | |
| 1874 | process_read_fpregs_xmm(l, (struct fxsave *)&mcp->__fpregs); |
| 1875 | *flags |= _UC_FPU; |
| 1876 | } |
| 1877 | |
| 1878 | int |
| 1879 | cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags) |
| 1880 | { |
| 1881 | struct trapframe *tf = l->l_md.md_regs; |
| 1882 | const __greg_t *gr = mcp->__gregs; |
| 1883 | struct proc *p = l->l_proc; |
| 1884 | int error; |
| 1885 | int err, trapno; |
| 1886 | int64_t rflags; |
| 1887 | |
| 1888 | CTASSERT(sizeof (mcontext_t) == 26 * 8 + 8 + 512); |
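| | /* |
| | * The layout being asserted: 26 8-byte general registers, an |
| | * 8-byte _mc_tlsbase, and a 512-byte fxsave-format FPU area. |
| | */ |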
| 1889 | |
| 1890 | if ((flags & _UC_CPU) != 0) { |
| 1891 | error = cpu_mcontext_validate(l, mcp); |
| 1892 | if (error != 0) |
| 1893 | return error; |
| 1894 | /* |
| 1895 | * Save and restore some values we don't want to change: |
| 1896 | * _FRAME_GREG(copy_to_tf) below would otherwise overwrite them. |
| 1897 | * |
| 1898 | * XXX maybe inline this. |
| 1899 | */ |
| 1900 | rflags = tf->tf_rflags; |
| 1901 | err = tf->tf_err; |
| 1902 | trapno = tf->tf_trapno; |
| 1903 | |
| 1904 | /* Copy general registers member by member */ |
| 1905 | #define copy_to_tf(reg, REG, idx) tf->tf_##reg = gr[_REG_##REG]; |
| 1906 | _FRAME_GREG(copy_to_tf) |
| 1907 | #undef copy_to_tf |
| 1908 | |
| 1909 | #ifdef XEN |
| 1910 | /* |
| 1911 | * Xen has its own way of dealing with %cs and %ss, |
| 1912 | * so reset them to the proper values. |
| 1913 | */ |
| 1914 | tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL); |
| 1915 | tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); |
| 1916 | #endif |
| 1917 | rflags &= ~PSL_USER; |
| 1918 | tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER); |
| 1919 | tf->tf_err = err; |
| 1920 | tf->tf_trapno = trapno; |
| 1921 | |
| 1922 | l->l_md.md_flags |= MDL_IRET; |
| 1923 | } |
| 1924 | |
| 1925 | if ((flags & _UC_FPU) != 0) |
| 1926 | process_write_fpregs_xmm(l, (const struct fxsave *)&mcp->__fpregs); |
| 1927 | |
| 1928 | if ((flags & _UC_TLSBASE) != 0) |
| 1929 | lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase); |
| 1930 | |
| 1931 | mutex_enter(p->p_lock); |
| 1932 | if (flags & _UC_SETSTACK) |
| 1933 | l->l_sigstk.ss_flags |= SS_ONSTACK; |
| 1934 | if (flags & _UC_CLRSTACK) |
| 1935 | l->l_sigstk.ss_flags &= ~SS_ONSTACK; |
| 1936 | mutex_exit(p->p_lock); |
| 1937 | |
| 1938 | return 0; |
| 1939 | } |
| 1940 | |
| 1941 | int |
| 1942 | cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp) |
| 1943 | { |
| 1944 | const __greg_t *gr; |
| 1945 | uint16_t sel; |
| 1946 | int error; |
| 1947 | struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap; |
| 1948 | struct proc *p = l->l_proc; |
| 1949 | struct trapframe *tf = l->l_md.md_regs; |
| 1950 | |
| 1951 | gr = mcp->__gregs; |
| 1952 | |
| 1953 | if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0) |
| 1954 | return EINVAL; |
| 1955 | |
| 1956 | if (__predict_false(pmap->pm_ldt != NULL)) { |
| 1957 | error = valid_user_selector(l, gr[_REG_ES]); |
| 1958 | if (error != 0) |
| 1959 | return error; |
| 1960 | |
| 1961 | error = valid_user_selector(l, gr[_REG_FS]); |
| 1962 | if (error != 0) |
| 1963 | return error; |
| 1964 | |
| 1965 | error = valid_user_selector(l, gr[_REG_GS]); |
| 1966 | if (error != 0) |
| 1967 | return error; |
| 1968 | |
| 1969 | if ((gr[_REG_DS] & 0xffff) == 0) |
| 1970 | return EINVAL; |
| 1971 | error = valid_user_selector(l, gr[_REG_DS]); |
| 1972 | if (error != 0) |
| 1973 | return error; |
| 1974 | |
| 1975 | #ifndef XEN |
| 1976 | if ((gr[_REG_SS] & 0xffff) == 0) |
| 1977 | return EINVAL; |
| 1978 | error = valid_user_selector(l, gr[_REG_SS]); |
| 1979 | if (error != 0) |
| 1980 | return error; |
| 1981 | #endif |
| 1982 | } else { |
| 1983 | #define VUD(sel) \ |
| 1984 | ((p->p_flag & PK_32) ? VALID_USER_DSEL32(sel) : VALID_USER_DSEL(sel)) |
| 1985 | sel = gr[_REG_ES] & 0xffff; |
| 1986 | if (sel != 0 && !VUD(sel)) |
| 1987 | return EINVAL; |
| 1988 | |
| 1989 | /* XXX: Shouldn't this be FSEL32? */ |
| 1990 | #define VUF(sel) \ |
| 1991 | ((p->p_flag & PK_32) ? VALID_USER_DSEL32(sel) : VALID_USER_DSEL(sel)) |
| 1992 | sel = gr[_REG_FS] & 0xffff; |
| 1993 | if (sel != 0 && !VUF(sel)) |
| 1994 | return EINVAL; |
| 1995 | |
| 1996 | #define VUG(sel) \ |
| 1997 | ((p->p_flag & PK_32) ? VALID_USER_GSEL32(sel) : VALID_USER_DSEL(sel)) |
| 1998 | sel = gr[_REG_GS] & 0xffff; |
| 1999 | if (sel != 0 && !VUG(sel)) |
| 2000 | return EINVAL; |
| 2001 | |
| 2002 | sel = gr[_REG_DS] & 0xffff; |
| 2003 | if (!VUD(sel)) |
| 2004 | return EINVAL; |
| 2005 | |
| 2006 | #ifndef XEN |
| 2007 | sel = gr[_REG_SS] & 0xffff; |
| 2008 | if (!VUD(sel)) |
| 2009 | return EINVAL; |
| 2010 | #endif |
| 2011 | |
| 2012 | } |
| 2013 | |
| 2014 | #ifndef XEN |
| 2015 | #define VUC(sel) \ |
| 2016 | ((p->p_flag & PK_32) ? VALID_USER_CSEL32(sel) : VALID_USER_CSEL(sel)) |
| 2017 | sel = gr[_REG_CS] & 0xffff; |
| 2018 | if (!VUC(sel)) |
| 2019 | return EINVAL; |
| 2020 | #endif |
| 2021 | |
| 2022 | if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS) |
| 2023 | return EINVAL; |
| 2024 | return 0; |
| 2025 | } |
| 2026 | |
| 2027 | void |
| 2028 | cpu_initclocks(void) |
| 2029 | { |
| 2030 | (*initclock_func)(); /* indirect: the clock attach code picks the real routine */ |
| 2031 | } |
| 2032 | |
| 2033 | static int |
| 2034 | valid_user_selector(struct lwp *l, uint64_t seg) |
| 2035 | { |
| 2036 | int off, len; |
| 2037 | char *dt; |
| 2038 | struct mem_segment_descriptor *sdp; |
| 2039 | struct proc *p = l->l_proc; |
| 2040 | struct pmap *pmap = p->p_vmspace->vm_map.pmap; |
| 2041 | uint64_t base; |
| 2042 | |
| 2043 | seg &= 0xffff; |
| 2044 | |
| 2045 | if (seg == 0) |
| 2046 | return 0; |
| 2047 | |
| 2048 | off = (seg & 0xfff8); |
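| | /* |
| | * An x86 selector is: bits 15..3 descriptor index, bit 2 table |
| | * indicator (SEL_LDT), bits 1..0 RPL. Masking with 0xfff8 thus |
| | * yields the byte offset of the descriptor within its table. |
| | */ |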
| 2049 | if (seg & SEL_LDT) { |
| 2050 | if (pmap->pm_ldt != NULL) { |
| 2051 | len = pmap->pm_ldt_len; /* XXX broken */ |
| 2052 | dt = (char *)pmap->pm_ldt; |
| 2053 | } else { |
| 2054 | dt = ldtstore; |
| 2055 | len = LDT_SIZE; |
| 2056 | } |
| 2057 | |
| 2058 | if (off > (len - 8)) |
| 2059 | return EINVAL; |
| 2060 | } else { |
| 2061 | CTASSERT(GUDATA_SEL & SEL_LDT); |
| 2062 | KASSERT(seg != GUDATA_SEL); |
| 2063 | CTASSERT(GUDATA32_SEL & SEL_LDT); |
| 2064 | KASSERT(seg != GUDATA32_SEL); |
| 2065 | return EINVAL; |
| 2066 | } |
| 2067 | |
| 2068 | sdp = (struct mem_segment_descriptor *)(dt + off); |
| 2069 | if (sdp->sd_type < SDT_MEMRO || sdp->sd_p == 0) |
| 2070 | return EINVAL; |
| 2071 | |
| 2072 | base = ((uint64_t)sdp->sd_hibase << 32) | ((uint64_t)sdp->sd_lobase); |
| 2073 | if (sdp->sd_gran == 1) |
| 2074 | base <<= PAGE_SHIFT; |
| 2075 | |
| 2076 | if (base >= VM_MAXUSER_ADDRESS) |
| 2077 | return EINVAL; |
| 2078 | |
| 2079 | return 0; |
| 2080 | } |
| 2081 | |
| 2082 | int |
| 2083 | mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled) |
| 2084 | { |
| 2085 | extern int start, __data_start; |
| 2086 | const vaddr_t v = (vaddr_t)ptr; |
| 2087 | |
| 2088 | if (v >= (vaddr_t)&start && v < (vaddr_t)kern_end) { |
| 2089 | *handled = true; |
| 2090 | /* Either the text or rodata segment */ |
| 2091 | if (v < (vaddr_t)&__data_start && (prot & VM_PROT_WRITE)) |
| 2092 | return EFAULT; |
| 2093 | |
| 2094 | } else if (v >= module_start && v < module_end) { |
| 2095 | *handled = true; |
| 2096 | if (!uvm_map_checkprot(module_map, v, v + 1, prot)) |
| 2097 | return EFAULT; |
| 2098 | } else { |
| 2099 | *handled = false; |
| 2100 | } |
| 2101 | return 0; |
| 2102 | } |
| 2103 | |
| 2104 | /* |
| 2105 | * Zero out an LWP's TLS context (the %fs and %gs selectors and the |
| 2106 | * associated base values). Used when exec'ing a new program. |
| 2107 | */ |
| 2108 | |
| 2109 | void |
| 2110 | cpu_fsgs_zero(struct lwp *l) |
| 2111 | { |
| 2112 | struct trapframe * const tf = l->l_md.md_regs; |
| 2113 | struct pcb *pcb; |
| 2114 | uint64_t zero = 0; |
| 2115 | |
| 2116 | pcb = lwp_getpcb(l); |
| 2117 | if (l == curlwp) { |
| 2118 | kpreempt_disable(); |
| 2119 | tf->tf_fs = 0; |
| 2120 | tf->tf_gs = 0; |
| 2121 | setfs(0); |
| 2122 | #ifndef XEN |
| 2123 | setusergs(0); |
| 2124 | #else |
| 2125 | HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0); |
| 2126 | #endif |
| 2127 | if ((l->l_proc->p_flag & PK_32) == 0) { |
| 2128 | #ifndef XEN |
| 2129 | wrmsr(MSR_FSBASE, 0); |
| 2130 | wrmsr(MSR_KERNELGSBASE, 0); |
| 2131 | #else |
| 2132 | HYPERVISOR_set_segment_base(SEGBASE_FS, 0); |
| 2133 | HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0); |
| 2134 | #endif |
| 2135 | } |
| 2136 | pcb->pcb_fs = 0; |
| 2137 | pcb->pcb_gs = 0; |
| 2138 | update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero); |
| 2139 | update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero); |
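| | /* |
| | * For 32-bit lwps the %fs/%gs bases come from the per-CPU |
| | * GUFS_SEL/GUGS_SEL descriptors updated here; 64-bit lwps use the |
| | * FSBASE/KERNELGSBASE MSRs cleared above instead. |
| | */ |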
| 2140 | kpreempt_enable(); |
| 2141 | } else { |
| 2142 | tf->tf_fs = 0; |
| 2143 | tf->tf_gs = 0; |
| 2144 | pcb->pcb_fs = 0; |
| 2145 | pcb->pcb_gs = 0; |
| 2146 | } |
| 2148 | } |
| 2149 | |
| 2150 | /* |
| 2151 | * Load an LWP's TLS context, possibly changing the %fs and %gs selectors. |
| 2152 | * Used only for 32-bit processes. |
| 2153 | */ |
| 2154 | |
| 2155 | void |
| 2156 | cpu_fsgs_reload(struct lwp *l, int fssel, int gssel) |
| 2157 | { |
| 2158 | struct trapframe *tf; |
| 2159 | struct pcb *pcb; |
| 2160 | |
| 2161 | KASSERT(l->l_proc->p_flag & PK_32); |
| 2162 | tf = l->l_md.md_regs; |
| 2163 | if (l == curlwp) { |
| 2164 | pcb = lwp_getpcb(l); |
| 2165 | kpreempt_disable(); |
| 2166 | update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs); |
| 2167 | update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs); |
| 2168 | setfs(fssel); |
| 2169 | #ifndef XEN |
| 2170 | setusergs(gssel); |
| 2171 | #else |
| 2172 | HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gssel); |
| 2173 | #endif |
| 2174 | tf->tf_fs = fssel; |
| 2175 | tf->tf_gs = gssel; |
| 2176 | kpreempt_enable(); |
| 2177 | } else { |
| 2178 | tf->tf_fs = fssel; |
| 2179 | tf->tf_gs = gssel; |
| 2180 | } |
| 2181 | } |
| 2182 | |
| 2184 | #ifdef __HAVE_DIRECT_MAP |
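| | /* |
| | * With a direct map, all of physical memory is mapped contiguously |
| | * at PMAP_DIRECT_BASE, so VA<->PA conversion is simple arithmetic |
| | * on that base rather than a page-table walk. |
| | */ |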
| 2185 | bool |
| 2186 | mm_md_direct_mapped_io(void *addr, paddr_t *paddr) |
| 2187 | { |
| 2188 | vaddr_t va = (vaddr_t)addr; |
| 2189 | |
| 2190 | if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { |
| 2191 | *paddr = PMAP_DIRECT_UNMAP(va); |
| 2192 | return true; |
| 2193 | } |
| 2194 | return false; |
| 2195 | } |
| 2196 | |
| 2197 | bool |
| 2198 | mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr) |
| 2199 | { |
| 2200 | *vaddr = PMAP_DIRECT_MAP(paddr); |
| 2201 | return true; |
| 2202 | } |
| 2203 | #endif |
| 2204 | |