copy_process()源码

版本2.6.23
摘自https://elixir.bootlin.com/linux/v2.6.23/source/kernel/fork.c#L959

文字描述

1.调用 dup_task_struct 复制当前的 task_struct
2.检查进程数是否超过限制
3.初始化自旋锁、挂起信号、CPU 定时器等
4.调用 sched_fork 初始化进程数据结构,并把进程状态设置为 TASK_RUNNING
5.复制所有进程信息,包括文件系统、信号处理函数、信号、内存管理等
6.调用 copy_thread 初始化子进程内核栈
7终止并返回子进程描述符指针
原文链接:https://blog.csdn.net/tiankong_/java/article/details/76190708

copy_process()源码 2.6.23

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
struct pid *pid){
int retval;
struct task_struct *p = NULL;

if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);

/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
*/
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
return ERR_PTR(-EINVAL);

/*
* Shared signal handlers imply shared VM. By way of the above,
* thread groups also imply shared VM. Blocking this case allows
* for various simplifications in other code.
*/
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL);

retval = security_task_create(clone_flags);
if (retval)
goto fork_out;

retval = -ENOMEM;
p = dup_task_struct(current);
if (!p)
goto fork_out;

rt_mutex_init_task(p);

#ifdef CONFIG_TRACE_IRQFLAGS
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
if (atomic_read(&p->user->processes) >=
p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
p->user != current->nsproxy->user_ns->root_user)
goto bad_fork_free;
}

atomic_inc(&p->user->__count);
atomic_inc(&p->user->processes);
get_group_info(p->group_info);

/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
* to stop root fork bombs.
*/
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;

if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;

if (p->binfmt && !try_module_get(p->binfmt->module))
goto bad_fork_cleanup_put_domain;

p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
p->pid = pid_nr(pid);
retval = -EFAULT;
if (clone_flags & CLONE_PARENT_SETTID)
if (put_user(p->pid, parent_tidptr))
goto bad_fork_cleanup_delays_binfmt;

INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);

clear_tsk_thread_flag(p, TIF_SIGPENDING);
init_sigpending(&p->pending);

p->utime = cputime_zero;
p->stime = cputime_zero;

#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
p->wchar = 0; /* I/O counter: bytes written */
p->syscr = 0; /* I/O counter: read syscalls */
p->syscw = 0; /* I/O counter: write syscalls */
#endif
task_io_accounting_init(p);
acct_clear_integrals(p);

p->it_virt_expires = cputime_zero;
p->it_prof_expires = cputime_zero;
p->it_sched_expires = 0;
INIT_LIST_HEAD(&p->cpu_timers[0]);
INIT_LIST_HEAD(&p->cpu_timers[1]);
INIT_LIST_HEAD(&p->cpu_timers[2]);

p->lock_depth = -1; /* -1 = no lock */
do_posix_clock_monotonic_gettime(&p->start_time);
p->real_start_time = p->start_time;
monotonic_to_bootbased(&p->real_start_time);
p->security = NULL;
p->io_context = NULL;
p->io_wait = NULL;
p->audit_context = NULL;
cpuset_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_copy(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
goto bad_fork_cleanup_cpuset;
}
mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
p->hardirqs_enabled = 1;
#else
p->hardirqs_enabled = 0;
#endif
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
p->hardirq_disable_event = 0;
p->softirqs_enabled = 1;
p->softirq_enable_ip = _THIS_IP_;
p->softirq_enable_event = 0;
p->softirq_disable_ip = 0;
p->softirq_disable_event = 0;
p->hardirq_context = 0;
p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif

p->tgid = p->pid;
if (clone_flags & CLONE_THREAD)
p->tgid = current->tgid;

if ((retval = security_task_alloc(p)))
goto bad_fork_cleanup_policy;
if ((retval = audit_alloc(p)))
goto bad_fork_cleanup_security;
/* copy all the process information */
if ((retval = copy_semundo(clone_flags, p)))
goto bad_fork_cleanup_audit;
if ((retval = copy_files(clone_flags, p)))
goto bad_fork_cleanup_semundo;
if ((retval = copy_fs(clone_flags, p)))
goto bad_fork_cleanup_files;
if ((retval = copy_sighand(clone_flags, p)))
goto bad_fork_cleanup_fs;
if ((retval = copy_signal(clone_flags, p)))
goto bad_fork_cleanup_sighand;
if ((retval = copy_mm(clone_flags, p)))
goto bad_fork_cleanup_signal;
if ((retval = copy_keys(clone_flags, p)))
goto bad_fork_cleanup_mm;
if ((retval = copy_namespaces(clone_flags, p)))
goto bad_fork_cleanup_keys;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_namespaces;

p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
p->robust_list = NULL;
#ifdef CONFIG_COMPAT
p->compat_robust_list = NULL;
#endif
INIT_LIST_HEAD(&p->pi_state_list);
p->pi_state_cache = NULL;

/*
* sigaltstack should be cleared when sharing the same VM
*/
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
p->sas_ss_sp = p->sas_ss_size = 0;

/*
* Syscall tracing should be turned off in the child regardless
* of CLONE_PTRACE.
*/
clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif

/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */
p->parent_exec_id = p->self_exec_id;

/* ok, now we should be set up.. */
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;
p->exit_state = 0;

/*
* Ok, make it visible to the rest of the system.
* We dont wake it up yet.
*/
p->group_leader = p;
INIT_LIST_HEAD(&p->thread_group);
INIT_LIST_HEAD(&p->ptrace_children);
INIT_LIST_HEAD(&p->ptrace_list);

/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p, clone_flags);

/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);

/* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
p->ioprio = current->ioprio;

/*
* The task hasn't been attached yet, so its cpus_allowed mask will
* not be changed, nor will its assigned CPU.
*
* The cpus_allowed mask of the parent may have changed after it was
* copied first time - so re-copy it here, then check the child's CPU
* to ensure it is on a valid CPU (and if not, just force it back to
* parent's CPU). This avoids alot of nasty races.
*/
p->cpus_allowed = current->cpus_allowed;
if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
!cpu_online(task_cpu(p))))
set_task_cpu(p, smp_processor_id());

/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
p->real_parent = current->real_parent;
else
p->real_parent = current;
p->parent = p->real_parent;

spin_lock(&current->sighand->siglock);

/*
* Process group and session signals need to be delivered to just the
* parent before the fork or both the parent and the child after the
* fork. Restart if a signal comes in before we add the new process to
* it's process group.
* A fatal signal pending means that current will exit, so the new
* thread can't slip out of an OOM kill (or normal SIGKILL).
*/
recalc_sigpending();
if (signal_pending(current)) {
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
goto bad_fork_cleanup_namespaces;
}

if (clone_flags & CLONE_THREAD) {
p->group_leader = current->group_leader;
list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);

if (!cputime_eq(current->signal->it_virt_expires,
cputime_zero) ||
!cputime_eq(current->signal->it_prof_expires,
cputime_zero) ||
current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
!list_empty(&current->signal->cpu_timers[0]) ||
!list_empty(&current->signal->cpu_timers[1]) ||
!list_empty(&current->signal->cpu_timers[2])) {
/*
* Have child wake up on its first tick to check
* for process CPU timers.
*/
p->it_prof_expires = jiffies_to_cputime(1);
}
}

if (likely(p->pid)) {
add_parent(p);
if (unlikely(p->ptrace & PT_PTRACED))
__ptrace_link(p, current->parent);

if (thread_group_leader(p)) {
p->signal->tty = current->signal->tty;
p->signal->pgrp = process_group(current);
set_signal_session(p->signal, process_session(current));
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
attach_pid(p, PIDTYPE_SID, task_session(current));

list_add_tail_rcu(&p->tasks, &init_task.tasks);
__get_cpu_var(process_counts)++;
}
attach_pid(p, PIDTYPE_PID, pid);
nr_threads++;
}

total_forks++;
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
return p;

bad_fork_cleanup_namespaces:
exit_task_namespaces(p);
bad_fork_cleanup_keys:
exit_keys(p);
bad_fork_cleanup_mm:
if (p->mm)
mmput(p->mm);
bad_fork_cleanup_signal:
cleanup_signal(p);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
exit_fs(p); /* blocking */
bad_fork_cleanup_files:
exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
exit_sem(p);
bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_security:
security_task_free(p);
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
mpol_free(p->mempolicy);
bad_fork_cleanup_cpuset:
#endif
cpuset_exit(p);
bad_fork_cleanup_delays_binfmt:
delayacct_tsk_free(p);
if (p->binfmt)
module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
put_group_info(p->group_info);
atomic_dec(&p->user->processes);
free_uid(p->user);
bad_fork_free:
free_task(p);
fork_out:
return ERR_PTR(retval);
}

task_struct源码 2.6.23

task_struct 定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
atomic_t usage;
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;

int lock_depth; /* BKL lock depth */

#ifdef CONFIG_SMP
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
int oncpu;
#endif
#endif

int prio, static_prio, normal_prio;
struct list_head run_list;
struct sched_class *sched_class;
struct sched_entity se;

#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif

unsigned short ioprio;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif

unsigned int policy;
cpumask_t cpus_allowed;
unsigned int time_slice;

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
#endif

struct list_head tasks;
/*
* ptrace_list/ptrace_children forms the list of my children
* that were stolen by a ptracer.
*/
struct list_head ptrace_children;
struct list_head ptrace_list;

struct mm_struct *mm, *active_mm;

/* task state */
struct linux_binfmt *binfmt;
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
/* ??? */
unsigned int personality;
unsigned did_exec:1;
pid_t pid;
pid_t tgid;

#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary;
#endif
/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->parent->pid)
*/
struct task_struct *real_parent; /* real parent process (when being debugged) */
struct task_struct *parent; /* parent process */
/*
* children/sibling forms the list of my children plus the
* tasks I'm ptracing.
*/
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */

/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;

struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */

unsigned int rt_priority;
cputime_t utime, stime;
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;

cputime_t it_prof_expires, it_virt_expires;
unsigned long long it_sched_expires;
struct list_head cpu_timers[3];

/* process credentials */
uid_t uid,euid,suid,fsuid;
gid_t gid,egid,sgid,fsgid;
struct group_info *group_info;
kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
unsigned keep_capabilities:1;
struct user_struct *user;
#ifdef CONFIG_KEYS
struct key *request_key_auth; /* assumed request_key authority */
struct key *thread_keyring; /* keyring private to this thread */
unsigned char jit_keyring; /* default keyring to attach requested keys to */
#endif
/*
* fpu_counter contains the number of consecutive context switches
* that the FPU is used. If this is over a threshold, the lazy fpu
* saving becomes unlazy to save the trap. This is an unsigned char
* so that after 256 times the counter wraps and the behavior turns
* lazy again; this to deal with bursty apps that only use FPU for
* a short time
*/
unsigned char fpu_counter;
int oomkilladj; /* OOM kill score adjustment (bit shift). */
char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
it with task_lock())
- initialized normally by flush_old_exec */
/* file system info */
int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
/* namespaces */
struct nsproxy *nsproxy;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;

sigset_t blocked, real_blocked;
sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */
struct sigpending pending;

unsigned long sas_ss_sp;
size_t sas_ss_size;
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;

void *security;
struct audit_context *audit_context;
seccomp_t seccomp;

/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
spinlock_t alloc_lock;

/* Protection of the PI data structures: */
spinlock_t pi_lock;

#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
struct plist_head pi_waiters;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
/* mutex deadlock detection */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
int hardirqs_enabled;
unsigned long hardirq_enable_ip;
unsigned int hardirq_enable_event;
unsigned long hardirq_disable_ip;
unsigned int hardirq_disable_event;
int softirqs_enabled;
unsigned long softirq_disable_ip;
unsigned int softirq_disable_event;
unsigned long softirq_enable_ip;
unsigned int softirq_enable_event;
int hardirq_context;
int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 30UL
u64 curr_chain_key;
int lockdep_depth;
struct held_lock held_locks[MAX_LOCK_DEPTH];
unsigned int lockdep_recursion;
#endif

/* journalling filesystem info */
void *journal_info;

/* stacked block device info */
struct bio *bio_list, **bio_tail;

/* VM state */
struct reclaim_state *reclaim_state;

struct backing_dev_info *backing_dev_info;

struct io_context *io_context;

unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
/*
* current io wait handle: wait queue entry to use for io waits
* If this thread is processing aio, this points at the waitqueue
* inside the currently handled kiocb. It may be NULL (i.e. default
* to a stack based synchronous wait) if its doing sync IO.
*/
wait_queue_t *io_wait;
#ifdef CONFIG_TASK_XACCT
/* i/o counters(bytes read/written, #syscalls */
u64 rchar, wchar, syscr, syscw;
#endif
struct task_io_accounting ioac;
#if defined(CONFIG_TASK_XACCT)
u64 acct_rss_mem1; /* accumulated rss usage */
u64 acct_vm_mem1; /* accumulated virtual memory usage */
cputime_t acct_stimexpd;/* stime since last update */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy;
short il_next;
#endif
#ifdef CONFIG_CPUSETS
struct cpuset *cpuset;
nodemask_t mems_allowed;
int cpuset_mems_generation;
int cpuset_mem_spread_rotor;
#endif
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;

atomic_t fs_excl; /* holding fs exclusive resources */
struct rcu_head rcu;

/*
* cache last used pipe for splice
*/
struct pipe_inode_info *splice_pipe;
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
};

pid 定义

1
2
3
4
5
6
7
8
9
10
struct pid
{
atomic_t count;
/* Try to keep pid_chain in the same cacheline as nr for find_pid */
int nr;
struct hlist_node pid_chain;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
struct rcu_head rcu;
};

copy_mm(),mm_struct有关

copy_process其中最重要的是 copy_mm() ,该函数通过建立新进程所有页表和内存描述符来创建进程地址空间;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
struct mm_struct
{
struct vm_area_struct * mmap; // 指向线性区对象的链表头
struct rb_root mm_rb; // 指向线性区对象的红黑树的根
pgd_t * pgd; // 指向页全局目录
atomic_t mm_users; // 次使用计数器,存放共享 mm_struct 数据结构轻量级进程的个数
atomic_t mm_count; // 主使用计数器,每当 mm_count 递减,内核就要检查它是否为0,如果是就要解除这个内存描述符
}
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
struct mm_struct * mm, *oldmm;
int retval;

tsk->min_flt = tsk->maj_flt = 0;
tsk->nvcsw = tsk->nivcsw = 0;

tsk->mm = NULL;
tsk->active_mm = NULL;

/*
* Are we cloning a kernel thread?
*
* We need to steal a active VM for that..
*/
oldmm = current->mm; //oldmm 初始化为父进程的 mm_struct
if (!oldmm)
return 0;

if (clone_flags & CLONE_VM) {
atomic_inc(&oldmm->mm_users); // 父进程的地址空间引用计数加一
mm = oldmm;// 将父进程地址空间赋给子进程
goto good_mm;
}

retval = -ENOMEM;
mm = dup_mm(tsk);
if (!mm)
goto fail_nomem;

good_mm:
/* Initializing for Swap token stuff */
mm->token_priority = 0;
mm->last_interval = 0;

tsk->mm = mm;
tsk->active_mm = mm;
return 0;

fail_nomem:
return retval;
}

dup_mm()

调用dup_mmap(mm, oldmm);

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
static struct mm_struct *dup_mm(struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm = current->mm;
int err;

if (!oldmm)
return NULL;

mm = allocate_mm();
if (!mm)
goto fail_nomem;

memcpy(mm, oldmm, sizeof(*mm));

/* Initializing for Swap token stuff */
mm->token_priority = 0;
mm->last_interval = 0;

if (!mm_init(mm))
goto fail_nomem;

if (init_new_context(tsk, mm))
goto fail_nocontext;

err = dup_mmap(mm, oldmm);
if (err)
goto free_pt;

mm->hiwater_rss = get_mm_rss(mm);
mm->hiwater_vm = mm->total_vm;

return mm;

free_pt:
mmput(mm);

fail_nomem:
return NULL;

fail_nocontext:
/*
* If init_new_context() failed, we cannot use mmput() to free the mm
* because it calls destroy_context()
*/
mm_free_pgd(mm);
free_mm(mm);
return NULL;
}

dup_mmap()

复制父进程每个 vm_area_struct 线性区描述符,插入到子进程的线性区链表和红黑树中;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
struct vm_area_struct
{
struct mm_struct * vm_mm; // 指向线性区所在的内存描述符
unsigned long vm_start; // 当前线性区起始地址
unsigned long vm_end; // 线性区尾地址
struct vm_area_struct * vm_next; // 下一个线性区
pgprot_t vm_page_prot; // 线性区访问权限
struct rb_node vm_rb; // 用于红黑树搜索的节点
}
static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
struct vm_area_struct *mpnt, *tmp, **pprev;
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
struct mempolicy *pol;

down_write(&oldmm->mmap_sem);
flush_cache_dup_mm(oldmm);
/*
* Not linked in yet - no deadlock potential:
*/
down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

mm->locked_vm = 0;
mm->mmap = NULL;
mm->mmap_cache = NULL;
mm->free_area_cache = oldmm->mmap_base;
mm->cached_hole_size = ~0UL;
mm->map_count = 0;
cpus_clear(mm->cpu_vm_mask);
mm->mm_rb = RB_ROOT;
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
pprev = &mm->mmap;

for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
struct file *file;

if (mpnt->vm_flags & VM_DONTCOPY) {
long pages = vma_pages(mpnt);
mm->total_vm -= pages;
vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
-pages);
continue;
}
charge = 0;
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
if (security_vm_enough_memory(len))
goto fail_nomem;
charge = len;
}
tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!tmp)
goto fail_nomem;
*tmp = *mpnt;
pol = mpol_copy(vma_policy(mpnt));
retval = PTR_ERR(pol);
if (IS_ERR(pol))
goto fail_nomem_policy;
vma_set_policy(tmp, pol);
tmp->vm_flags &= ~VM_LOCKED;
tmp->vm_mm = mm;
tmp->vm_next = NULL;
anon_vma_link(tmp);
file = tmp->vm_file;
if (file) {
struct inode *inode = file->f_path.dentry->d_inode;
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);

/* insert tmp into the share list, just after mpnt */
spin_lock(&file->f_mapping->i_mmap_lock);
tmp->vm_truncate_count = mpnt->vm_truncate_count;
flush_dcache_mmap_lock(file->f_mapping);
vma_prio_tree_add(tmp, mpnt);
flush_dcache_mmap_unlock(file->f_mapping);
spin_unlock(&file->f_mapping->i_mmap_lock);
}

/*
* Link in the new vma and copy the page table entries.
*/
*pprev = tmp;
pprev = &tmp->vm_next;

__vma_link_rb(mm, tmp, rb_link, rb_parent);
rb_link = &tmp->vm_rb.rb_right;
rb_parent = &tmp->vm_rb;

mm->map_count++;
retval = copy_page_range(mm, oldmm, mpnt);

if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);

if (retval)
goto out;
}
/* a new mm has just been created */
arch_dup_mmap(oldmm, mm);
retval = 0;
out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
up_write(&oldmm->mmap_sem);
return retval;
fail_nomem_policy:
kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
retval = -ENOMEM;
vm_unacct_memory(charge);
goto out;
}

引自https://www.cnblogs.com/chenxinshuo/p/11968329.html