Version:  2.0.40 2.2.26 2.4.37 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 3.10 3.11 3.12 3.13 3.14 3.15 3.16 3.17

Linux/arch/x86/kernel/process_64.c

  1 /*
  2  *  Copyright (C) 1995  Linus Torvalds
  3  *
  4  *  Pentium III FXSR, SSE support
  5  *      Gareth Hughes <gareth@valinux.com>, May 2000
  6  *
  7  *  X86-64 port
  8  *      Andi Kleen.
  9  *
 10  *      CPU hotplug support - ashok.raj@intel.com
 11  */
 12 
 13 /*
 14  * This file handles the architecture-dependent parts of process handling.
 15  */
 16 
 17 #include <linux/cpu.h>
 18 #include <linux/errno.h>
 19 #include <linux/sched.h>
 20 #include <linux/fs.h>
 21 #include <linux/kernel.h>
 22 #include <linux/mm.h>
 23 #include <linux/elfcore.h>
 24 #include <linux/smp.h>
 25 #include <linux/slab.h>
 26 #include <linux/user.h>
 27 #include <linux/interrupt.h>
 28 #include <linux/delay.h>
 29 #include <linux/module.h>
 30 #include <linux/ptrace.h>
 31 #include <linux/notifier.h>
 32 #include <linux/kprobes.h>
 33 #include <linux/kdebug.h>
 34 #include <linux/prctl.h>
 35 #include <linux/uaccess.h>
 36 #include <linux/io.h>
 37 #include <linux/ftrace.h>
 38 
 39 #include <asm/pgtable.h>
 40 #include <asm/processor.h>
 41 #include <asm/i387.h>
 42 #include <asm/fpu-internal.h>
 43 #include <asm/mmu_context.h>
 44 #include <asm/prctl.h>
 45 #include <asm/desc.h>
 46 #include <asm/proto.h>
 47 #include <asm/ia32.h>
 48 #include <asm/idle.h>
 49 #include <asm/syscalls.h>
 50 #include <asm/debugreg.h>
 51 #include <asm/switch_to.h>
 52 
 53 asmlinkage extern void ret_from_fork(void); /* declaration only; defined outside this file */
 54 
 55 __visible DEFINE_PER_CPU(unsigned long, old_rsp); /* per-CPU slot: set to new_sp by start_thread_common(), exchanged with thread.usersp in __switch_to() */
 56 
 57 /* Prints also some state that isn't saved in the pt_regs */
 58 void __show_regs(struct pt_regs *regs, int all)
 59 {
 60         unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
 61         unsigned long d0, d1, d2, d3, d6, d7;
 62         unsigned int fsindex, gsindex;
 63         unsigned int ds, cs, es;
 64 
 65         printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
 66         printk_address(regs->ip);
 67         printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
 68                         regs->sp, regs->flags);
 69         printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
 70                regs->ax, regs->bx, regs->cx);
 71         printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
 72                regs->dx, regs->si, regs->di);
 73         printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
 74                regs->bp, regs->r8, regs->r9);
 75         printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
 76                regs->r10, regs->r11, regs->r12);
 77         printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
 78                regs->r13, regs->r14, regs->r15);
 79 
 80         asm("movl %%ds,%0" : "=r" (ds));
 81         asm("movl %%cs,%0" : "=r" (cs));
 82         asm("movl %%es,%0" : "=r" (es));
 83         asm("movl %%fs,%0" : "=r" (fsindex));
 84         asm("movl %%gs,%0" : "=r" (gsindex));
 85 
 86         rdmsrl(MSR_FS_BASE, fs);
 87         rdmsrl(MSR_GS_BASE, gs);
 88         rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
 89 
 90         if (!all)
 91                 return;
 92 
 93         cr0 = read_cr0();
 94         cr2 = read_cr2();
 95         cr3 = read_cr3();
 96         cr4 = read_cr4();
 97 
 98         printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
 99                fs, fsindex, gs, gsindex, shadowgs);
100         printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
101                         es, cr0);
102         printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
103                         cr4);
104 
105         get_debugreg(d0, 0);
106         get_debugreg(d1, 1);
107         get_debugreg(d2, 2);
108         get_debugreg(d3, 3);
109         get_debugreg(d6, 6);
110         get_debugreg(d7, 7);
111 
112         /* Only print out debug registers if they are in their non-default state. */
113         if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
114             (d6 == DR6_RESERVED) && (d7 == 0x400))
115                 return;
116 
117         printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
118         printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
119 
120 }
121 
122 void release_thread(struct task_struct *dead_task)
123 {
124         if (dead_task->mm) {
125                 if (dead_task->mm->context.size) {
126                         pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
127                                 dead_task->comm,
128                                 dead_task->mm->context.ldt,
129                                 dead_task->mm->context.size);
130                         BUG();
131                 }
132         }
133 }
134 
135 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
136 {
137         struct user_desc ud = {
138                 .base_addr = addr,
139                 .limit = 0xfffff,
140                 .seg_32bit = 1,
141                 .limit_in_pages = 1,
142                 .useable = 1,
143         };
144         struct desc_struct *desc = t->thread.tls_array;
145         desc += tls;
146         fill_ldt(desc, &ud);
147 }
148 
149 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
150 {
151         return get_desc_base(&t->thread.tls_array[tls]);
152 }
153 
154 int copy_thread(unsigned long clone_flags, unsigned long sp,
155                 unsigned long arg, struct task_struct *p)
156 {
157         int err;
158         struct pt_regs *childregs;
159         struct task_struct *me = current;
160 
161         p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
162         childregs = task_pt_regs(p);
163         p->thread.sp = (unsigned long) childregs;
164         p->thread.usersp = me->thread.usersp;
165         set_tsk_thread_flag(p, TIF_FORK);
166         p->thread.fpu_counter = 0;
167         p->thread.io_bitmap_ptr = NULL;
168 
169         savesegment(gs, p->thread.gsindex);
170         p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
171         savesegment(fs, p->thread.fsindex);
172         p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
173         savesegment(es, p->thread.es);
174         savesegment(ds, p->thread.ds);
175         memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
176 
177         if (unlikely(p->flags & PF_KTHREAD)) {
178                 /* kernel thread */
179                 memset(childregs, 0, sizeof(struct pt_regs));
180                 childregs->sp = (unsigned long)childregs;
181                 childregs->ss = __KERNEL_DS;
182                 childregs->bx = sp; /* function */
183                 childregs->bp = arg;
184                 childregs->orig_ax = -1;
185                 childregs->cs = __KERNEL_CS | get_kernel_rpl();
186                 childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
187                 return 0;
188         }
189         *childregs = *current_pt_regs();
190 
191         childregs->ax = 0;
192         if (sp)
193                 childregs->sp = sp;
194 
195         err = -ENOMEM;
196         memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
197 
198         if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
199                 p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
200                                                   IO_BITMAP_BYTES, GFP_KERNEL);
201                 if (!p->thread.io_bitmap_ptr) {
202                         p->thread.io_bitmap_max = 0;
203                         return -ENOMEM;
204                 }
205                 set_tsk_thread_flag(p, TIF_IO_BITMAP);
206         }
207 
208         /*
209          * Set a new TLS for the child thread?
210          */
211         if (clone_flags & CLONE_SETTLS) {
212 #ifdef CONFIG_IA32_EMULATION
213                 if (test_thread_flag(TIF_IA32))
214                         err = do_set_thread_area(p, -1,
215                                 (struct user_desc __user *)childregs->si, 0);
216                 else
217 #endif
218                         err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
219                 if (err)
220                         goto out;
221         }
222         err = 0;
223 out:
224         if (err && p->thread.io_bitmap_ptr) {
225                 kfree(p->thread.io_bitmap_ptr);
226                 p->thread.io_bitmap_max = 0;
227         }
228 
229         return err;
230 }
231 
232 static void
233 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
234                     unsigned long new_sp,
235                     unsigned int _cs, unsigned int _ss, unsigned int _ds)
236 {
237         loadsegment(fs, 0);
238         loadsegment(es, _ds);
239         loadsegment(ds, _ds);
240         load_gs_index(0);
241         current->thread.usersp  = new_sp;
242         regs->ip                = new_ip;
243         regs->sp                = new_sp;
244         this_cpu_write(old_rsp, new_sp);
245         regs->cs                = _cs;
246         regs->ss                = _ss;
247         regs->flags             = X86_EFLAGS_IF;
248 }
249 
250 void
251 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
252 {
253         start_thread_common(regs, new_ip, new_sp,
254                             __USER_CS, __USER_DS, 0);
255 }
256 
#ifdef CONFIG_IA32_EMULATION
/*
 * Compat variant of start_thread(): the code segment is __USER_CS when the
 * current thread has TIF_X32 set and __USER32_CS otherwise; %ss, %ds and
 * %es all use __USER_DS.
 */
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	unsigned int code_sel;

	if (test_thread_flag(TIF_X32))
		code_sel = __USER_CS;
	else
		code_sel = __USER32_CS;

	start_thread_common(regs, new_ip, new_sp,
			    code_sel, __USER_DS, __USER_DS);
}
#endif
266 
267 /*
268  *      switch_to(x,y) should switch tasks from x to y.
269  *
270  * This could still be optimized:
271  * - fold all the options into a flag word and test it with a single test.
272  * - could test fs/gs bitsliced
273  *
274  * Kprobes not supported here. Set the probe on schedule instead.
275  * Function graph tracer not supported too.
276  */
277 __visible __notrace_funcgraph struct task_struct *
278 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
279 {
280         struct thread_struct *prev = &prev_p->thread;
281         struct thread_struct *next = &next_p->thread;
282         int cpu = smp_processor_id();
283         struct tss_struct *tss = &per_cpu(init_tss, cpu);
284         unsigned fsindex, gsindex;
285         fpu_switch_t fpu;
286 
287         fpu = switch_fpu_prepare(prev_p, next_p, cpu);
288 
289         /*
290          * Reload esp0, LDT and the page table pointer:
291          */
292         load_sp0(tss, next);
293 
294         /*
295          * Switch DS and ES.
296          * This won't pick up thread selector changes, but I guess that is ok.
297          */
298         savesegment(es, prev->es);
299         if (unlikely(next->es | prev->es))
300                 loadsegment(es, next->es);
301 
302         savesegment(ds, prev->ds);
303         if (unlikely(next->ds | prev->ds))
304                 loadsegment(ds, next->ds);
305 
306 
307         /* We must save %fs and %gs before load_TLS() because
308          * %fs and %gs may be cleared by load_TLS().
309          *
310          * (e.g. xen_load_tls())
311          */
312         savesegment(fs, fsindex);
313         savesegment(gs, gsindex);
314 
315         load_TLS(next, cpu);
316 
317         /*
318          * Leave lazy mode, flushing any hypercalls made here.
319          * This must be done before restoring TLS segments so
320          * the GDT and LDT are properly updated, and must be
321          * done before math_state_restore, so the TS bit is up
322          * to date.
323          */
324         arch_end_context_switch(next_p);
325 
326         /*
327          * Switch FS and GS.
328          *
329          * Segment register != 0 always requires a reload.  Also
330          * reload when it has changed.  When prev process used 64bit
331          * base always reload to avoid an information leak.
332          */
333         if (unlikely(fsindex | next->fsindex | prev->fs)) {
334                 loadsegment(fs, next->fsindex);
335                 /*
336                  * Check if the user used a selector != 0; if yes
337                  *  clear 64bit base, since overloaded base is always
338                  *  mapped to the Null selector
339                  */
340                 if (fsindex)
341                         prev->fs = 0;
342         }
343         /* when next process has a 64bit base use it */
344         if (next->fs)
345                 wrmsrl(MSR_FS_BASE, next->fs);
346         prev->fsindex = fsindex;
347 
348         if (unlikely(gsindex | next->gsindex | prev->gs)) {
349                 load_gs_index(next->gsindex);
350                 if (gsindex)
351                         prev->gs = 0;
352         }
353         if (next->gs)
354                 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
355         prev->gsindex = gsindex;
356 
357         switch_fpu_finish(next_p, fpu);
358 
359         /*
360          * Switch the PDA and FPU contexts.
361          */
362         prev->usersp = this_cpu_read(old_rsp);
363         this_cpu_write(old_rsp, next->usersp);
364         this_cpu_write(current_task, next_p);
365 
366         /*
367          * If it were not for PREEMPT_ACTIVE we could guarantee that the
368          * preempt_count of all tasks was equal here and this would not be
369          * needed.
370          */
371         task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
372         this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
373 
374         this_cpu_write(kernel_stack,
375                   (unsigned long)task_stack_page(next_p) +
376                   THREAD_SIZE - KERNEL_STACK_OFFSET);
377 
378         /*
379          * Now maybe reload the debug registers and handle I/O bitmaps
380          */
381         if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
382                      task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
383                 __switch_to_xtra(prev_p, next_p, tss);
384 
385         return prev_p;
386 }
387 
388 void set_personality_64bit(void)
389 {
390         /* inherit personality from parent */
391 
392         /* Make sure to be in 64bit mode */
393         clear_thread_flag(TIF_IA32);
394         clear_thread_flag(TIF_ADDR32);
395         clear_thread_flag(TIF_X32);
396 
397         /* Ensure the corresponding mm is not marked. */
398         if (current->mm)
399                 current->mm->context.ia32_compat = 0;
400 
401         /* TBD: overwrites user setup. Should have two bits.
402            But 64bit processes have always behaved this way,
403            so it's not too bad. The main problem is just that
404            32bit childs are affected again. */
405         current->personality &= ~READ_IMPLIES_EXEC;
406 }
407 
408 void set_personality_ia32(bool x32)
409 {
410         /* inherit personality from parent */
411 
412         /* Make sure to be in 32bit mode */
413         set_thread_flag(TIF_ADDR32);
414 
415         /* Mark the associated mm as containing 32-bit tasks. */
416         if (x32) {
417                 clear_thread_flag(TIF_IA32);
418                 set_thread_flag(TIF_X32);
419                 if (current->mm)
420                         current->mm->context.ia32_compat = TIF_X32;
421                 current->personality &= ~READ_IMPLIES_EXEC;
422                 /* is_compat_task() uses the presence of the x32
423                    syscall bit flag to determine compat status */
424                 current_thread_info()->status &= ~TS_COMPAT;
425         } else {
426                 set_thread_flag(TIF_IA32);
427                 clear_thread_flag(TIF_X32);
428                 if (current->mm)
429                         current->mm->context.ia32_compat = TIF_IA32;
430                 current->personality |= force_personality32;
431                 /* Prepare the first "return" to user space */
432                 current_thread_info()->status |= TS_COMPAT;
433         }
434 }
435 EXPORT_SYMBOL_GPL(set_personality_ia32);
436 
437 unsigned long get_wchan(struct task_struct *p)
438 {
439         unsigned long stack;
440         u64 fp, ip;
441         int count = 0;
442 
443         if (!p || p == current || p->state == TASK_RUNNING)
444                 return 0;
445         stack = (unsigned long)task_stack_page(p);
446         if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
447                 return 0;
448         fp = *(u64 *)(p->thread.sp);
449         do {
450                 if (fp < (unsigned long)stack ||
451                     fp >= (unsigned long)stack+THREAD_SIZE)
452                         return 0;
453                 ip = *(u64 *)(fp+8);
454                 if (!in_sched_functions(ip))
455                         return ip;
456                 fp = *(u64 *)fp;
457         } while (count++ < 16);
458         return 0;
459 }
460 
461 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
462 {
463         int ret = 0;
464         int doit = task == current;
465         int cpu;
466 
467         switch (code) {
468         case ARCH_SET_GS:
469                 if (addr >= TASK_SIZE_OF(task))
470                         return -EPERM;
471                 cpu = get_cpu();
472                 /* handle small bases via the GDT because that's faster to
473                    switch. */
474                 if (addr <= 0xffffffff) {
475                         set_32bit_tls(task, GS_TLS, addr);
476                         if (doit) {
477                                 load_TLS(&task->thread, cpu);
478                                 load_gs_index(GS_TLS_SEL);
479                         }
480                         task->thread.gsindex = GS_TLS_SEL;
481                         task->thread.gs = 0;
482                 } else {
483                         task->thread.gsindex = 0;
484                         task->thread.gs = addr;
485                         if (doit) {
486                                 load_gs_index(0);
487                                 ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
488                         }
489                 }
490                 put_cpu();
491                 break;
492         case ARCH_SET_FS:
493                 /* Not strictly needed for fs, but do it for symmetry
494                    with gs */
495                 if (addr >= TASK_SIZE_OF(task))
496                         return -EPERM;
497                 cpu = get_cpu();
498                 /* handle small bases via the GDT because that's faster to
499                    switch. */
500                 if (addr <= 0xffffffff) {
501                         set_32bit_tls(task, FS_TLS, addr);
502                         if (doit) {
503                                 load_TLS(&task->thread, cpu);
504                                 loadsegment(fs, FS_TLS_SEL);
505                         }
506                         task->thread.fsindex = FS_TLS_SEL;
507                         task->thread.fs = 0;
508                 } else {
509                         task->thread.fsindex = 0;
510                         task->thread.fs = addr;
511                         if (doit) {
512                                 /* set the selector to 0 to not confuse
513                                    __switch_to */
514                                 loadsegment(fs, 0);
515                                 ret = wrmsrl_safe(MSR_FS_BASE, addr);
516                         }
517                 }
518                 put_cpu();
519                 break;
520         case ARCH_GET_FS: {
521                 unsigned long base;
522                 if (task->thread.fsindex == FS_TLS_SEL)
523                         base = read_32bit_tls(task, FS_TLS);
524                 else if (doit)
525                         rdmsrl(MSR_FS_BASE, base);
526                 else
527                         base = task->thread.fs;
528                 ret = put_user(base, (unsigned long __user *)addr);
529                 break;
530         }
531         case ARCH_GET_GS: {
532                 unsigned long base;
533                 unsigned gsindex;
534                 if (task->thread.gsindex == GS_TLS_SEL)
535                         base = read_32bit_tls(task, GS_TLS);
536                 else if (doit) {
537                         savesegment(gs, gsindex);
538                         if (gsindex)
539                                 rdmsrl(MSR_KERNEL_GS_BASE, base);
540                         else
541                                 base = task->thread.gs;
542                 } else
543                         base = task->thread.gs;
544                 ret = put_user(base, (unsigned long __user *)addr);
545                 break;
546         }
547 
548         default:
549                 ret = -EINVAL;
550                 break;
551         }
552 
553         return ret;
554 }
555 
556 long sys_arch_prctl(int code, unsigned long addr)
557 {
558         return do_arch_prctl(current, code, addr);
559 }
560 
561 unsigned long KSTK_ESP(struct task_struct *task)
562 {
563         return (test_tsk_thread_flag(task, TIF_IA32)) ?
564                         (task_pt_regs(task)->sp) : ((task)->thread.usersp);
565 }
566 

This page was automatically generated by LXR 0.3.1 (source).  •  Linux is a registered trademark of Linus Torvalds  •  Contact us