
Linux/kernel/fork.c

  1 /*
  2  *  linux/kernel/fork.c
  3  *
  4  *  Copyright (C) 1991, 1992  Linus Torvalds
  5  */
  6 
  7 /*
  8  *  'fork.c' contains the help-routines for the 'fork' system call
  9  * (see also entry.S and others).
 10  * Fork is rather simple, once you get the hang of it, but the memory
 11  * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 12  */
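
For orientation, and not part of fork.c itself: the helpers in this file sit behind the fork() system call (in this kernel the entry path is roughly sys_fork -> _do_fork -> copy_process). A minimal, hedged sketch of the userspace caller's side:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
        pid_t pid = fork();     /* both parent and child return from here */

        if (pid < 0) {          /* errno == EAGAIN once the limits set up later in this file are hit */
                perror("fork");
                return EXIT_FAILURE;
        }
        if (pid == 0) {
                printf("child:  pid=%d\n", getpid());
                _exit(EXIT_SUCCESS);
        }
        waitpid(pid, NULL, 0);
        printf("parent: forked child %d\n", pid);
        return EXIT_SUCCESS;
}
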
 13 
 14 #include <linux/slab.h>
 15 #include <linux/init.h>
 16 #include <linux/unistd.h>
 17 #include <linux/module.h>
 18 #include <linux/vmalloc.h>
 19 #include <linux/completion.h>
 20 #include <linux/personality.h>
 21 #include <linux/mempolicy.h>
 22 #include <linux/sem.h>
 23 #include <linux/file.h>
 24 #include <linux/fdtable.h>
 25 #include <linux/iocontext.h>
 26 #include <linux/key.h>
 27 #include <linux/binfmts.h>
 28 #include <linux/mman.h>
 29 #include <linux/mmu_notifier.h>
 30 #include <linux/fs.h>
 31 #include <linux/mm.h>
 32 #include <linux/vmacache.h>
 33 #include <linux/nsproxy.h>
 34 #include <linux/capability.h>
 35 #include <linux/cpu.h>
 36 #include <linux/cgroup.h>
 37 #include <linux/security.h>
 38 #include <linux/hugetlb.h>
 39 #include <linux/seccomp.h>
 40 #include <linux/swap.h>
 41 #include <linux/syscalls.h>
 42 #include <linux/jiffies.h>
 43 #include <linux/futex.h>
 44 #include <linux/compat.h>
 45 #include <linux/kthread.h>
 46 #include <linux/task_io_accounting_ops.h>
 47 #include <linux/rcupdate.h>
 48 #include <linux/ptrace.h>
 49 #include <linux/mount.h>
 50 #include <linux/audit.h>
 51 #include <linux/memcontrol.h>
 52 #include <linux/ftrace.h>
 53 #include <linux/proc_fs.h>
 54 #include <linux/profile.h>
 55 #include <linux/rmap.h>
 56 #include <linux/ksm.h>
 57 #include <linux/acct.h>
 58 #include <linux/tsacct_kern.h>
 59 #include <linux/cn_proc.h>
 60 #include <linux/freezer.h>
 61 #include <linux/delayacct.h>
 62 #include <linux/taskstats_kern.h>
 63 #include <linux/random.h>
 64 #include <linux/tty.h>
 65 #include <linux/blkdev.h>
 66 #include <linux/fs_struct.h>
 67 #include <linux/magic.h>
 68 #include <linux/perf_event.h>
 69 #include <linux/posix-timers.h>
 70 #include <linux/user-return-notifier.h>
 71 #include <linux/oom.h>
 72 #include <linux/khugepaged.h>
 73 #include <linux/signalfd.h>
 74 #include <linux/uprobes.h>
 75 #include <linux/aio.h>
 76 #include <linux/compiler.h>
 77 #include <linux/sysctl.h>
 78 #include <linux/kcov.h>
 79 
 80 #include <asm/pgtable.h>
 81 #include <asm/pgalloc.h>
 82 #include <asm/uaccess.h>
 83 #include <asm/mmu_context.h>
 84 #include <asm/cacheflush.h>
 85 #include <asm/tlbflush.h>
 86 
 87 #include <trace/events/sched.h>
 88 
 89 #define CREATE_TRACE_POINTS
 90 #include <trace/events/task.h>
 91 
 92 /*
 93  * Minimum number of threads to boot the kernel
 94  */
 95 #define MIN_THREADS 20
 96 
 97 /*
 98  * Maximum number of threads
 99  */
100 #define MAX_THREADS FUTEX_TID_MASK
101 
102 /*
103  * Protected counters by write_lock_irq(&tasklist_lock)
104  */
105 unsigned long total_forks;      /* Handle normal Linux uptimes. */
106 int nr_threads;                 /* The idle threads do not count.. */
107 
108 int max_threads;                /* tunable limit on nr_threads */
109 
110 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
111 
112 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
113 
114 #ifdef CONFIG_PROVE_RCU
115 int lockdep_tasklist_lock_is_held(void)
116 {
117         return lockdep_is_held(&tasklist_lock);
118 }
119 EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
120 #endif /* #ifdef CONFIG_PROVE_RCU */
121 
122 int nr_processes(void)
123 {
124         int cpu;
125         int total = 0;
126 
127         for_each_possible_cpu(cpu)
128                 total += per_cpu(process_counts, cpu);
129 
130         return total;
131 }
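
nr_processes() illustrates the per-CPU counter pattern: each fork updates only its own CPU's process_counts slot (cheap, no shared cache line to bounce), and readers accept a slightly stale, lock-free sum. A hedged, generic sketch of the same idiom; the names here are illustrative, not part of fork.c:

#include <linux/percpu.h>
#include <linux/cpumask.h>

static DEFINE_PER_CPU(unsigned long, demo_event_counts);

static void demo_event(void)
{
        this_cpu_inc(demo_event_counts);        /* lockless, CPU-local update */
}

static unsigned long demo_event_total(void)
{
        unsigned long total = 0;
        int cpu;

        for_each_possible_cpu(cpu)              /* may race with writers; fine for statistics */
                total += per_cpu(demo_event_counts, cpu);

        return total;
}
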
132 
133 void __weak arch_release_task_struct(struct task_struct *tsk)
134 {
135 }
136 
137 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
138 static struct kmem_cache *task_struct_cachep;
139 
140 static inline struct task_struct *alloc_task_struct_node(int node)
141 {
142         return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
143 }
144 
145 static inline void free_task_struct(struct task_struct *tsk)
146 {
147         kmem_cache_free(task_struct_cachep, tsk);
148 }
149 #endif
150 
151 void __weak arch_release_thread_info(struct thread_info *ti)
152 {
153 }
154 
155 #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
156 
157 /*
158  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
159  * kmemcache based allocator.
160  */
161 # if THREAD_SIZE >= PAGE_SIZE
162 static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
163                                                   int node)
164 {
165         struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
166                                                   THREAD_SIZE_ORDER);
167 
168         if (page)
169                 memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
170                                             1 << THREAD_SIZE_ORDER);
171 
172         return page ? page_address(page) : NULL;
173 }
174 
175 static inline void free_thread_info(struct thread_info *ti)
176 {
177         struct page *page = virt_to_page(ti);
178 
179         memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
180                                     -(1 << THREAD_SIZE_ORDER));
181         __free_kmem_pages(page, THREAD_SIZE_ORDER);
182 }
183 # else
184 static struct kmem_cache *thread_info_cache;
185 
186 static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
187                                                   int node)
188 {
189         return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
190 }
191 
192 static void free_thread_info(struct thread_info *ti)
193 {
194         kmem_cache_free(thread_info_cache, ti);
195 }
196 
197 void thread_info_cache_init(void)
198 {
199         thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
200                                               THREAD_SIZE, 0, NULL);
201         BUG_ON(thread_info_cache == NULL);
202 }
203 # endif
204 #endif
205 
206 /* SLAB cache for signal_struct structures (tsk->signal) */
207 static struct kmem_cache *signal_cachep;
208 
209 /* SLAB cache for sighand_struct structures (tsk->sighand) */
210 struct kmem_cache *sighand_cachep;
211 
212 /* SLAB cache for files_struct structures (tsk->files) */
213 struct kmem_cache *files_cachep;
214 
215 /* SLAB cache for fs_struct structures (tsk->fs) */
216 struct kmem_cache *fs_cachep;
217 
218 /* SLAB cache for vm_area_struct structures */
219 struct kmem_cache *vm_area_cachep;
220 
221 /* SLAB cache for mm_struct structures (tsk->mm) */
222 static struct kmem_cache *mm_cachep;
223 
224 static void account_kernel_stack(struct thread_info *ti, int account)
225 {
226         struct zone *zone = page_zone(virt_to_page(ti));
227 
228         mod_zone_page_state(zone, NR_KERNEL_STACK, account);
229 }
230 
231 void free_task(struct task_struct *tsk)
232 {
233         account_kernel_stack(tsk->stack, -1);
234         arch_release_thread_info(tsk->stack);
235         free_thread_info(tsk->stack);
236         rt_mutex_debug_task_free(tsk);
237         ftrace_graph_exit_task(tsk);
238         put_seccomp_filter(tsk);
239         arch_release_task_struct(tsk);
240         free_task_struct(tsk);
241 }
242 EXPORT_SYMBOL(free_task);
243 
244 static inline void free_signal_struct(struct signal_struct *sig)
245 {
246         taskstats_tgid_free(sig);
247         sched_autogroup_exit(sig);
248         kmem_cache_free(signal_cachep, sig);
249 }
250 
251 static inline void put_signal_struct(struct signal_struct *sig)
252 {
253         if (atomic_dec_and_test(&sig->sigcnt))
254                 free_signal_struct(sig);
255 }
256 
257 void __put_task_struct(struct task_struct *tsk)
258 {
259         WARN_ON(!tsk->exit_state);
260         WARN_ON(atomic_read(&tsk->usage));
261         WARN_ON(tsk == current);
262 
263         cgroup_free(tsk);
264         task_numa_free(tsk);
265         security_task_free(tsk);
266         exit_creds(tsk);
267         delayacct_tsk_free(tsk);
268         put_signal_struct(tsk->signal);
269 
270         if (!profile_handoff_task(tsk))
271                 free_task(tsk);
272 }
273 EXPORT_SYMBOL_GPL(__put_task_struct);
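
__put_task_struct() only runs once tsk->usage drops to zero; dup_task_struct() below starts that count at 2. Code that stashes a task pointer is expected to pair get_task_struct()/put_task_struct() around the period it holds it. A hedged sketch of a consumer (how tsk was obtained is assumed):

#include <linux/printk.h>
#include <linux/sched.h>

static void inspect_task_later(struct task_struct *tsk)
{
        get_task_struct(tsk);           /* pin: tsk cannot be freed underneath us */

        /* ... block, take locks, or defer work here ... */

        pr_info("comm=%s pid=%d\n", tsk->comm, task_pid_nr(tsk));

        put_task_struct(tsk);           /* the final put ends up in __put_task_struct() */
}
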
274 
275 void __init __weak arch_task_cache_init(void) { }
276 
277 /*
278  * set_max_threads
279  */
280 static void set_max_threads(unsigned int max_threads_suggested)
281 {
282         u64 threads;
283 
284         /*
285          * The number of threads shall be limited such that the thread
286          * structures may only consume a small part of the available memory.
287          */
288         if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
289                 threads = MAX_THREADS;
290         else
291                 threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
292                                     (u64) THREAD_SIZE * 8UL);
293 
294         if (threads > max_threads_suggested)
295                 threads = max_threads_suggested;
296 
297         max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
298 }
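
The division above caps the memory that kernel stacks may consume at roughly one eighth of RAM; the fls64() test merely guards the multiplication against 64-bit overflow. A worked example under assumed values (4 GiB of RAM, 4 KiB pages, 16 KiB stacks, i.e. a typical x86-64 configuration of this era):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t page_size      = 4096;                   /* assumed PAGE_SIZE   */
        uint64_t thread_size    = 16384;                  /* assumed THREAD_SIZE */
        uint64_t totalram_pages = (4ULL << 30) / page_size;

        /* same formula as set_max_threads(): stacks get at most ~1/8 of RAM */
        uint64_t threads = totalram_pages * page_size / (thread_size * 8);

        printf("max_threads ~= %llu\n", (unsigned long long)threads);   /* 32768 */
        return 0;
}

set_max_threads() then caps the result at the caller's suggestion (fork_init() passes MAX_THREADS) and clamps it to [MIN_THREADS, MAX_THREADS].
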
299 
300 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
301 /* Initialized by the architecture: */
302 int arch_task_struct_size __read_mostly;
303 #endif
304 
305 void __init fork_init(void)
306 {
307 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
308 #ifndef ARCH_MIN_TASKALIGN
309 #define ARCH_MIN_TASKALIGN      L1_CACHE_BYTES
310 #endif
311         /* create a slab on which task_structs can be allocated */
312         task_struct_cachep = kmem_cache_create("task_struct",
313                         arch_task_struct_size, ARCH_MIN_TASKALIGN,
314                         SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
315 #endif
316 
317         /* do the arch specific task caches init */
318         arch_task_cache_init();
319 
320         set_max_threads(MAX_THREADS);
321 
322         init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
323         init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
324         init_task.signal->rlim[RLIMIT_SIGPENDING] =
325                 init_task.signal->rlim[RLIMIT_NPROC];
326 }
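
fork_init() also seeds init's RLIMIT_NPROC soft and hard limits at max_threads/2; children inherit them, so this becomes the default per-user process limit unless login machinery or the administrator changes it. The value in effect is visible from userspace; a small hedged check:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
        struct rlimit rl;

        if (getrlimit(RLIMIT_NPROC, &rl) == 0)
                printf("RLIMIT_NPROC soft=%llu hard=%llu\n",
                       (unsigned long long)rl.rlim_cur,
                       (unsigned long long)rl.rlim_max);
        return 0;
}
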
327 
328 int __weak arch_dup_task_struct(struct task_struct *dst,
329                                                struct task_struct *src)
330 {
331         *dst = *src;
332         return 0;
333 }
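
arch_dup_task_struct() is a __weak default: a plain structure copy that an architecture can replace with a strong definition of the same name (x86, for instance, also copies FPU state). A hedged, generic sketch of the weak/strong override mechanism, with illustrative names:

/* kernel/demo.c: weak default, used when no architecture overrides it */
int __weak arch_demo_hook(int x)
{
        return x;
}

/* arch/foo/demo.c: a strong definition with the same signature wins at
 * link time, so call sites need no #ifdef */
int arch_demo_hook(int x)
{
        return x * 2;   /* stand-in for arch-specific work */
}
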
334 
335 void set_task_stack_end_magic(struct task_struct *tsk)
336 {
337         unsigned long *stackend;
338 
339         stackend = end_of_stack(tsk);
340         *stackend = STACK_END_MAGIC;    /* for overflow detection */
341 }
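
set_task_stack_end_magic() plants a sentinel at the lowest usable word of the new task's kernel stack. Later checks, such as the scheduler's debug path, can then detect cheaply that a deep call chain ran off the end of the stack. A hedged sketch of such a detector, modeled on the kernel's task_stack_end_corrupted() macro and assuming the helpers from <linux/sched.h>:

static inline bool stack_end_corrupted(struct task_struct *tsk)
{
        /* sentinel overwritten => the kernel stack overflowed */
        return *end_of_stack(tsk) != STACK_END_MAGIC;
}
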
342 
343 static struct task_struct *dup_task_struct(struct task_struct *orig)
344 {
345         struct task_struct *tsk;
346         struct thread_info *ti;
347         int node = tsk_fork_get_node(orig);
348         int err;
349 
350         tsk = alloc_task_struct_node(node);
351         if (!tsk)
352                 return NULL;
353 
354         ti = alloc_thread_info_node(tsk, node);
355         if (!ti)
356                 goto free_tsk;
357 
358         err = arch_dup_task_struct(tsk, orig);
359         if (err)
360                 goto free_ti;
361 
362         tsk->stack = ti;
363 #ifdef CONFIG_SECCOMP
364         /*
365          * We must handle setting up seccomp filters once we're under
366          * the sighand lock in case orig has changed between now and
367          * then. Until then, filter must be NULL to avoid messing up
368          * the usage counts on the error path calling free_task.
369          */
370         tsk->seccomp.filter = NULL;
371 #endif
372 
373         setup_thread_stack(tsk, orig);
374         clear_user_return_notifier(tsk);
375         clear_tsk_need_resched(tsk);
376         set_task_stack_end_magic(tsk);
377 
378 #ifdef CONFIG_CC_STACKPROTECTOR
379         tsk->stack_canary = get_random_int();
380 #endif
381 
382         /*
383          * One for us, one for whoever does the "release_task()" (usually
384          * parent)
385          */
386         atomic_set(&tsk->usage, 2);
387 #ifdef CONFIG_BLK_DEV_IO_TRACE
388         tsk->btrace_seq = 0;
389 #endif
390         tsk->splice_pipe = NULL;
391         tsk->task_frag.page = NULL;
392         tsk->wake_q.next = NULL;
393 
394         account_kernel_stack(ti, 1);
395 
396         kcov_task_init(tsk);
397 
398         return tsk;
399 
400 free_ti:
401         free_thread_info(ti);
402 free_tsk:
403         free_task_struct(tsk);
404         return NULL;
405 }
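
Two details in dup_task_struct() are worth calling out: the usage count starts at 2 (one reference for the new task itself, one for whoever eventually calls release_task()), and failures unwind in reverse allocation order through goto labels. A hedged, generic sketch of that unwind idiom, with illustrative names:

#include <linux/slab.h>

struct widget { void *buf; };

static int widget_register(struct widget *w)
{
        return 0;               /* stub: hypothetical registration step that may fail */
}

static struct widget *widget_create(void)
{
        struct widget *w = kmalloc(sizeof(*w), GFP_KERNEL);

        if (!w)
                return NULL;

        w->buf = kmalloc(256, GFP_KERNEL);
        if (!w->buf)
                goto free_w;            /* undo only the first allocation */

        if (widget_register(w))
                goto free_buf;          /* undo the second, then the first */

        return w;

free_buf:
        kfree(w->buf);
free_w:
        kfree(w);
        return NULL;
}
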
406 
407 #ifdef CONFIG_MMU
408 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
409 {
410         struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
411         struct rb_node **rb_link, *rb_parent;
412         int retval;
413         unsigned long charge;
414