The difference between a process and a thread has long been a favorite interview question, which suggests the topic is not as simple as it looks. Put briefly, a process owns its own independent memory resources, while a thread shares the resources of its parent process. In other words, a thread does not own memory resources of its own, so it costs the system less; this is why a thread is also called a lightweight process.

  Beyond resource consumption, another point worth mentioning is memory visibility. Resources are isolated between processes, so sharing variables across processes is hard to do, whereas sharing variables between threads is much easier: because threads share the process's memory, a change made by one thread can be observed by the others. This makes sharing state and coordinating concurrency much more convenient (processes can also share information through inter-process communication, but it is too heavyweight for most application-level needs). That is why almost every programming language provides a way to create and use threads.
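  To make the visibility point concrete, here is a minimal sketch (not taken from the kernel or glibc sources discussed below): two threads increment an ordinary global variable and both sets of increments are visible, precisely because they live in one shared address space; the mutex is the concurrency control that this sharing makes necessary.

// shared_counter.c -- compile with: gcc shared_counter.c -pthread
#include <pthread.h>
#include <stdio.h>

static long counter = 0;                      // lives in the single shared address space
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *arg)
{
    for (int i = 0; i < 100000; i++) {
        pthread_mutex_lock(&lock);            // needed precisely because memory is shared
        counter++;
        pthread_mutex_unlock(&lock);
    }
    return NULL;
}

int main(void)
{
    pthread_t t1, t2;
    pthread_create(&t1, NULL, worker, NULL);
    pthread_create(&t2, NULL, worker, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    printf("counter = %ld\n", counter);       // both threads' writes are visible: 200000
    return 0;
}

  Doing the same across two processes would require explicitly established shared memory (mmap, System V shm, and so on), which is exactly the heavier IPC machinery mentioned above.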

  So how are processes and threads actually created? As we know, every high-level language ultimately relies on primitives implemented by the operating system, and creating processes and threads, being such a core concept, is naturally handed off to the OS in the end. So let's look at how the operating system creates processes and threads.

1. Overview of process creation in Linux

  In Linux, processes and threads share the same data structure; the kernel does not make a hard distinction between the two. The benefit is that both are treated as independent scheduling units and need no special casing, which keeps the scheduling code simpler.
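  One user-visible consequence of this design, shown in a small sketch (assuming a Linux/glibc environment): every thread is its own schedulable task with its own thread ID, while all threads of a process share one thread-group ID, which is what getpid() reports.

// tid_vs_pid.c -- compile with: gcc tid_vs_pid.c -pthread
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

static void *worker(void *arg)
{
    // older glibc has no gettid() wrapper, so issue the syscall directly
    printf("thread: pid(tgid)=%d  tid=%ld\n", getpid(), (long) syscall(SYS_gettid));
    return NULL;
}

int main(void)
{
    printf("main:   pid(tgid)=%d  tid=%ld\n", getpid(), (long) syscall(SYS_gettid));
    pthread_t t;
    pthread_create(&t, NULL, worker, NULL);
    pthread_join(t, NULL);
    return 0;
}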

  Normally we create a process directly with the fork() function and a thread with pthread_create(); both are user-level API functions. We can use these two as the entry points for studying how processes and threads are created.

  As the unit of scheduling, a process or thread naturally has a corresponding descriptor: task_struct. It describes everything a task contains and is capable of, and it is a very important abstraction.

// include/linux/sched.h
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
* For reasons of header soup (see current_thread_info()), this
* must be the first element of task_struct.
*/
struct thread_info thread_info;
#endif
/* -1 unrunnable, 0 runnable, >0 stopped: */
volatile long state; /*
* This begins the randomizable portion of task_struct. Only
* scheduling-critical items should be added above here.
*/
randomized_struct_fields_start void *stack;
atomic_t usage;
/* Per task flags (PF_*), defined further below: */
unsigned int flags;
unsigned int ptrace; #ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* Current CPU: */
unsigned int cpu;
#endif
unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
struct task_struct *last_wakee; /*
* recent_used_cpu is initially set as the last CPU used by a task
* that wakes affine another task. Waker/wakee relationships can
* push tasks around a CPU where each wakeup moves to the next one.
* Tracking a recently used CPU allows a quick search for a recently
* used CPU that may be idle.
*/
int recent_used_cpu;
int wake_cpu;
#endif
int on_rq; int prio;
int static_prio;
int normal_prio;
unsigned int rt_priority; const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
struct sched_dl_entity dl; #ifdef CONFIG_PREEMPT_NOTIFIERS
/* List of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif #ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed; #ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
union rcu_special rcu_read_unlock_special;
struct list_head rcu_node_entry;
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU
unsigned long rcu_tasks_nvcsw;
u8 rcu_tasks_holdout;
u8 rcu_tasks_idx;
int rcu_tasks_idle_cpu;
struct list_head rcu_tasks_holdout_list;
#endif /* #ifdef CONFIG_TASKS_RCU */ struct sched_info sched_info; struct list_head tasks;
#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
#endif struct mm_struct *mm;
struct mm_struct *active_mm; /* Per-thread vma caching: */
struct vmacache vmacache; #ifdef SPLIT_RSS_COUNTING
struct task_rss_stat rss_stat;
#endif
int exit_state;
int exit_code;
int exit_signal;
/* The signal sent when the parent dies: */
int pdeath_signal;
/* JOBCTL_*, siglock protected: */
unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */
unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
unsigned sched_remote_wakeup:1;
/* Force alignment to the next boundary: */
unsigned :0; /* Unserialized, strictly 'current' */ /* Bit to tell LSMs we're in execve(): */
unsigned in_execve:1;
unsigned in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
unsigned restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
unsigned memcg_may_oom:1;
#ifndef CONFIG_SLOB
unsigned memcg_kmem_skip_account:1;
#endif
#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
/* disallow userland-initiated cgroup migration */
unsigned no_cgroup_migration:1;
#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid;
pid_t tgid; #ifdef CONFIG_STACKPROTECTOR
/* Canary value for the -fstack-protector GCC feature: */
unsigned long stack_canary;
#endif
/*
* Pointers to the (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->real_parent->pid)
*/ /* Real parent process: */
struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */
struct task_struct __rcu *parent; /*
* Children/sibling form the list of natural children:
*/
struct list_head children;
struct list_head sibling;
struct task_struct *group_leader; /*
* 'ptraced' is the list of tasks this task is using ptrace() on.
*
* This includes both natural children and PTRACE_ATTACH targets.
* 'ptrace_entry' is this task's link on the p->parent->ptraced list.
*/
struct list_head ptraced;
struct list_head ptrace_entry; /* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;
struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */
int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */
int __user *clear_child_tid; u64 utime;
u64 stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
u64 utimescaled;
u64 stimescaled;
#endif
u64 gtime;
struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
struct vtime vtime;
#endif #ifdef CONFIG_NO_HZ_FULL
atomic_t tick_dep_mask;
#endif
/* Context switch counts: */
unsigned long nvcsw;
unsigned long nivcsw; /* Monotonic time in nsecs: */
u64 start_time; /* Boot based time in nsecs: */
u64 real_start_time; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
unsigned long min_flt;
unsigned long maj_flt; #ifdef CONFIG_POSIX_TIMERS
struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
#endif /* Process credentials: */ /* Tracer's credentials at attach: */
const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */
const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */
const struct cred __rcu *cred; /*
* executable name, excluding path.
*
* - normally initialized setup_new_exec()
* - access it with [gs]et_task_comm()
* - lock it with task_lock()
*/
char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC
struct sysv_sem sysvsem;
struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
#endif
/* Filesystem information: */
struct fs_struct *fs; /* Open file information: */
struct files_struct *files; /* Namespaces: */
struct nsproxy *nsproxy; /* Signal handlers: */
struct signal_struct *signal;
struct sighand_struct *sighand;
sigset_t blocked;
sigset_t real_blocked;
/* Restored if set_restore_sigmask() was used: */
sigset_t saved_sigmask;
struct sigpending pending;
unsigned long sas_ss_sp;
size_t sas_ss_size;
unsigned int sas_ss_flags; struct callback_head *task_works; struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
kuid_t loginuid;
unsigned int sessionid;
#endif
struct seccomp seccomp; /* Thread group tracking: */
u32 parent_exec_id;
u32 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
spinlock_t alloc_lock; /* Protection of the PI data structures: */
raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task: */
struct rb_root_cached pi_waiters;
/* Updated under owner's pi_lock and rq lock */
struct task_struct *pi_top_task;
/* Deadlock detection and priority inheritance handling: */
struct rt_mutex_waiter *pi_blocked_on;
#endif #ifdef CONFIG_DEBUG_MUTEXES
/* Mutex deadlock detection: */
struct mutex_waiter *blocked_on;
#endif #ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
int hardirqs_enabled;
int hardirq_context;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
unsigned int softirq_disable_event;
unsigned int softirq_enable_event;
int softirqs_enabled;
int softirq_context;
#endif #ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
u64 curr_chain_key;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
#endif #ifdef CONFIG_UBSAN
unsigned int in_ubsan;
#endif /* Journalling filesystem info: */
void *journal_info; /* Stacked block device info: */
struct bio_list *bio_list; #ifdef CONFIG_BLOCK
/* Stack plugging: */
struct blk_plug *plug;
#endif /* VM state: */
struct reclaim_state *reclaim_state; struct backing_dev_info *backing_dev_info; struct io_context *io_context; /* Ptrace state: */
unsigned long ptrace_message;
siginfo_t *last_siginfo; struct task_io_accounting ioac;
#ifdef CONFIG_TASK_XACCT
/* Accumulated RSS usage: */
u64 acct_rss_mem1;
/* Accumulated virtual memory usage: */
u64 acct_vm_mem1;
/* stime + utime since last update: */
u64 acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
/* Protected by ->alloc_lock: */
nodemask_t mems_allowed;
/* Seqence number to catch updates: */
seqcount_t mems_allowed_seq;
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock: */
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock: */
struct list_head cg_list;
#endif
#ifdef CONFIG_INTEL_RDT
u32 closid;
u32 rmid;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
/* Protected by alloc_lock: */
struct mempolicy *mempolicy;
short il_prev;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
int numa_preferred_nid;
unsigned long numa_migrate_retry;
/* Migration stamp: */
u64 node_stamp;
u64 last_task_numa_placement;
u64 last_sum_exec_runtime;
struct callback_head numa_work; struct list_head numa_entry;
struct numa_group *numa_group; /*
* numa_faults is an array split into four regions:
* faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
* in this precise order.
*
* faults_memory: Exponential decaying average of faults on a per-node
* basis. Scheduling placement decisions are made based on these
* counts. The values remain static for the duration of a PTE scan.
* faults_cpu: Track the nodes the process was running on when a NUMA
* hinting fault was incurred.
* faults_memory_buffer and faults_cpu_buffer: Record faults per node
* during the current scan window. When the scan completes, the counts
* in faults_memory and faults_cpu decay and these values are copied.
*/
unsigned long *numa_faults;
unsigned long total_numa_faults; /*
* numa_faults_locality tracks if faults recorded during the last
* scan window were remote/local or failed to migrate. The task scan
* period is adapted based on the locality of the faults with different
* weights depending on whether they were shared or private faults
*/
unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_RSEQ
struct rseq __user *rseq;
u32 rseq_len;
u32 rseq_sig;
/*
* RmW on rseq_event_mask must be performed atomically
* with respect to preemption.
*/
unsigned long rseq_event_mask;
#endif struct tlbflush_unmap_batch tlb_ubc; struct rcu_head rcu; /* Cache last used pipe for splice(): */
struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif #ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
unsigned int fail_nth;
#endif
/*
* When (nr_dirtied >= nr_dirtied_pause), it's time to call
* balance_dirty_pages() for a dirty throttling pause:
*/
int nr_dirtied;
int nr_dirtied_pause;
/* Start of a write-and-pause period: */
unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
/*
* Time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
u64 timer_slack_ns;
u64 default_timer_slack_ns; #ifdef CONFIG_KASAN
unsigned int kasan_depth;
#endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack: */
int curr_ret_stack; /* Stack of return addresses for return function tracing: */
struct ftrace_ret_stack *ret_stack; /* Timestamp for last schedule: */
unsigned long long ftrace_timestamp; /*
* Number of functions that haven't been traced
* because of depth overrun:
*/
atomic_t trace_overrun; /* Pause tracing: */
atomic_t tracing_graph_pause;
#endif #ifdef CONFIG_TRACING
/* State flags for use by tracers: */
unsigned long trace; /* Bitmask and counter of trace recursion: */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV
/* Coverage collection mode enabled for this task (0 if disabled): */
unsigned int kcov_mode; /* Size of the kcov_area: */
unsigned int kcov_size; /* Buffer for coverage collection: */
void *kcov_area; /* KCOV descriptor wired with this task or NULL: */
struct kcov *kcov;
#endif #ifdef CONFIG_MEMCG
struct mem_cgroup *memcg_in_oom;
gfp_t memcg_oom_gfp_mask;
int memcg_oom_order; /* Number of pages to reclaim on returning to userland: */
unsigned int memcg_nr_pages_over_high;
#endif #ifdef CONFIG_UPROBES
struct uprobe_task *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long task_state_change;
#endif
int pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
#endif
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* A live task holds one reference: */
atomic_t stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
int patch_state;
#endif
#ifdef CONFIG_SECURITY
/* Used by LSM modules for access restriction: */
void *security;
#endif /*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
*/
randomized_struct_fields_end /* CPU-specific state of this task: */
struct thread_struct thread; /*
* WARNING: on x86, 'thread_struct' contains a variable-sized
* structure. It *MUST* be at the end of 'task_struct'.
*
* Do not put anything below here!
*/
};

  A lot of data structures are involved, because so many later operations on a process are built on top of this structure: memory, I/O, CPU, the scheduler, files, and so on. Its job is naturally a heavy one.

2. Creating a process: fork

  The fork discussed here is the user-level API fork. Its usage is simple: pid_t pid = fork(); gives you a new process, and the new process is already running.

  fork is called once but, on success, returns twice: once in the parent, where it returns the child's pid (non-zero), and once in the child, where it returns 0.

  This fork is essentially the same thing as the system-level fork, which is explained in detail below.
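  A minimal sketch of the "called once, returns twice" behavior described above:

// fork_demo.c -- compile with: gcc fork_demo.c
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();
    if (pid < 0) {
        perror("fork");                // the only case with a single (error) return
        return 1;
    }
    if (pid == 0) {
        // child branch: fork() returned 0
        printf("child:  my pid=%d, parent=%d\n", getpid(), getppid());
    } else {
        // parent branch: fork() returned the child's pid
        printf("parent: my pid=%d, child=%d\n", getpid(), pid);
        waitpid(pid, NULL, 0);         // reap the child
    }
    return 0;
}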

3. Creating a thread: pthread_create

  A thread is created with pthread_create(); its implementation is in the glibc source: https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_create.c;h=308db65cd4c148f8a119ed9025af194946aa2c80;hb=HEAD

  First, pthread_create() is exported through versioned_symbol, and its internal implementation is __pthread_create_2_1(), as shown below:

versioned_symbol (libpthread, __pthread_create_2_1, pthread_create, GLIBC_2_1);

  So we can study the thread-creation process through __pthread_create_2_1.

int
__pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
void *(*start_routine) (void *), void *arg)
{
STACK_VARIABLES; const struct pthread_attr *iattr = (struct pthread_attr *) attr;
struct pthread_attr default_attr;
bool free_cpuset = false;
bool c11 = (attr == ATTR_C11_THREAD);
if (iattr == NULL || c11)
{
lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
default_attr = __default_pthread_attr;
size_t cpusetsize = default_attr.cpusetsize;
if (cpusetsize > 0)
{
cpu_set_t *cpuset;
if (__glibc_likely (__libc_use_alloca (cpusetsize)))
cpuset = __alloca (cpusetsize);
else
{
cpuset = malloc (cpusetsize);
if (cpuset == NULL)
{
lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
return ENOMEM;
}
free_cpuset = true;
}
memcpy (cpuset, default_attr.cpuset, cpusetsize);
default_attr.cpuset = cpuset;
}
lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
iattr = &default_attr;
} struct pthread *pd = NULL;
int err = ALLOCATE_STACK (iattr, &pd);
int retval = 0; if (__glibc_unlikely (err != 0))
/* Something went wrong. Maybe a parameter of the attributes is
invalid or we could not allocate memory. Note we have to
translate error codes. */
{
retval = err == ENOMEM ? EAGAIN : err;
goto out;
} /* Initialize the TCB. All initializations with zero should be
performed in 'get_cached_stack'. This way we avoid doing this if
the stack freshly allocated with 'mmap'. */ #if TLS_TCB_AT_TP
/* Reference to the TCB itself. */
pd->header.self = pd; /* Self-reference for TLS. */
pd->header.tcb = pd;
#endif /* Store the address of the start routine and the parameter. Since
we do not start the function directly the stillborn thread will
get the information from its thread descriptor. */
pd->start_routine = start_routine;
pd->arg = arg;
pd->c11 = c11; /* Copy the thread attribute flags. */
struct pthread *self = THREAD_SELF;
pd->flags = ((iattr->flags & ~(ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
| (self->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))); /* Initialize the field for the ID of the thread which is waiting
for us. This is a self-reference in case the thread is created
detached. */
pd->joinid = iattr->flags & ATTR_FLAG_DETACHSTATE ? pd : NULL; /* The debug events are inherited from the parent. */
pd->eventbuf = self->eventbuf; /* Copy the parent's scheduling parameters. The flags will say what
is valid and what is not. */
pd->schedpolicy = self->schedpolicy;
pd->schedparam = self->schedparam; /* Copy the stack guard canary. */
#ifdef THREAD_COPY_STACK_GUARD
THREAD_COPY_STACK_GUARD (pd);
#endif /* Copy the pointer guard value. */
#ifdef THREAD_COPY_POINTER_GUARD
THREAD_COPY_POINTER_GUARD (pd);
#endif /* Setup tcbhead. */
tls_setup_tcbhead (pd); /* Verify the sysinfo bits were copied in allocate_stack if needed. */
#ifdef NEED_DL_SYSINFO
CHECK_THREAD_SYSINFO (pd);
#endif /* Inform start_thread (above) about cancellation state that might
translate into inherited signal state. */
pd->parent_cancelhandling = THREAD_GETMEM (THREAD_SELF, cancelhandling); /* Determine scheduling parameters for the thread. */
if (__builtin_expect ((iattr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0, 0)
&& (iattr->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)) != 0)
{
/* Use the scheduling parameters the user provided. */
if (iattr->flags & ATTR_FLAG_POLICY_SET)
{
pd->schedpolicy = iattr->schedpolicy;
pd->flags |= ATTR_FLAG_POLICY_SET;
}
if (iattr->flags & ATTR_FLAG_SCHED_SET)
{
/* The values were validated in pthread_attr_setschedparam. */
pd->schedparam = iattr->schedparam;
pd->flags |= ATTR_FLAG_SCHED_SET;
} if ((pd->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
!= (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
collect_default_sched (pd);
} if (__glibc_unlikely (__nptl_nthreads == 1))
_IO_enable_locks (); /* Pass the descriptor to the caller. */
*newthread = (pthread_t) pd; LIBC_PROBE (pthread_create, 4, newthread, attr, start_routine, arg); /* One more thread. We cannot have the thread do this itself, since it
might exist but not have been scheduled yet by the time we've returned
and need to check the value to behave correctly. We must do it before
creating the thread, in case it does get scheduled first and then
might mistakenly think it was the only thread. In the failure case,
we momentarily store a false value; this doesn't matter because there
is no kosher thing a signal handler interrupting us right here can do
that cares whether the thread count is correct. */
atomic_increment (&__nptl_nthreads); /* Our local value of stopped_start and thread_ran can be accessed at
any time. The PD->stopped_start may only be accessed if we have
ownership of PD (see CONCURRENCY NOTES above). */
bool stopped_start = false; bool thread_ran = false; /* Start the thread. */
if (__glibc_unlikely (report_thread_creation (pd)))
{
stopped_start = true; /* We always create the thread stopped at startup so we can
notify the debugger. */
retval = create_thread (pd, iattr, &stopped_start,
STACK_VARIABLES_ARGS, &thread_ran);
if (retval == 0)
{
/* We retain ownership of PD until (a) (see CONCURRENCY NOTES
above). */ /* Assert stopped_start is true in both our local copy and the
PD copy. */
assert (stopped_start);
assert (pd->stopped_start); /* Now fill in the information about the new thread in
the newly created thread's data structure. We cannot let
the new thread do this since we don't know whether it was
already scheduled when we send the event. */
pd->eventbuf.eventnum = TD_CREATE;
pd->eventbuf.eventdata = pd; /* Enqueue the descriptor. */
do
pd->nextevent = __nptl_last_event;
while (atomic_compare_and_exchange_bool_acq (&__nptl_last_event,
pd, pd->nextevent)
!= 0); /* Now call the function which signals the event. See
CONCURRENCY NOTES for the nptl_db interface comments. */
__nptl_create_event ();
}
}
else
retval = create_thread (pd, iattr, &stopped_start,
STACK_VARIABLES_ARGS, &thread_ran); if (__glibc_unlikely (retval != 0))
{
if (thread_ran)
/* State (c) or (d) and we may not have PD ownership (see
CONCURRENCY NOTES above). We can assert that STOPPED_START
must have been true because thread creation didn't fail, but
thread attribute setting did. */
/* See bug 19511 which explains why doing nothing here is a
resource leak for a joinable thread. */
assert (stopped_start);
else
{
/* State (e) and we have ownership of PD (see CONCURRENCY
NOTES above). */ /* Oops, we lied for a second. */
atomic_decrement (&__nptl_nthreads); /* Perhaps a thread wants to change the IDs and is waiting for this
stillborn thread. */
if (__glibc_unlikely (atomic_exchange_acq (&pd->setxid_futex, 0)
== -2))
futex_wake (&pd->setxid_futex, 1, FUTEX_PRIVATE); /* Free the resources. */
__deallocate_stack (pd);
} /* We have to translate error codes. */
if (retval == ENOMEM)
retval = EAGAIN;
}
else
{
/* We don't know if we have PD ownership. Once we check the local
stopped_start we'll know if we're in state (a) or (b) (see
CONCURRENCY NOTES above). */
if (stopped_start)
/* State (a), we own PD. The thread blocked on this lock either
because we're doing TD_CREATE event reporting, or for some
other reason that create_thread chose. Now let it run
free. */
lll_unlock (pd->lock, LLL_PRIVATE); /* We now have for sure more than one thread. The main thread might
not yet have the flag set. No need to set the global variable
again if this is what we use. */
THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
} out:
if (__glibc_unlikely (free_cpuset))
free (default_attr.cpuset); return retval;
}

  The code above mainly assembles the information required to create the thread and then calls create_thread() to actually create it. create_thread is not itself a kernel call, so we need to dig one level deeper.

// sysdeps/unix/sysv/linux/createthread.c
static int
create_thread (struct pthread *pd, const struct pthread_attr *attr,
               bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
{
  /* Determine whether the newly created threads has to be started
     stopped since we have to set the scheduling parameters or set the
     affinity.  */
  if (attr != NULL
      && (__glibc_unlikely (attr->cpuset != NULL)
          || __glibc_unlikely ((attr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0)))
    *stopped_start = true;

  pd->stopped_start = *stopped_start;
  if (__glibc_unlikely (*stopped_start))
    /* See CONCURRENCY NOTES in nptl/pthread_creat.c.  */
    lll_lock (pd->lock, LLL_PRIVATE);

  /* We rely heavily on various flags the CLONE function understands:

     CLONE_VM, CLONE_FS, CLONE_FILES
        These flags select semantics with shared address space and
        file descriptors according to what POSIX requires.

     CLONE_SIGHAND, CLONE_THREAD
        This flag selects the POSIX signal semantics and various
        other kinds of sharing (itimers, POSIX timers, etc.).

     CLONE_SETTLS
        The sixth parameter to CLONE determines the TLS area for the
        new thread.

     CLONE_PARENT_SETTID
        The kernels writes the thread ID of the newly created thread
        into the location pointed to by the fifth parameters to CLONE.

        Note that it would be semantically equivalent to use
        CLONE_CHILD_SETTID but it is be more expensive in the kernel.

     CLONE_CHILD_CLEARTID
        The kernels clears the thread ID of a thread that has called
        sys_exit() in the location pointed to by the seventh parameter
        to CLONE.

     The termination signal is chosen to be zero which means no signal
     is sent.  */
  const int clone_flags = (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SYSVSEM
                           | CLONE_SIGHAND | CLONE_THREAD
                           | CLONE_SETTLS | CLONE_PARENT_SETTID
                           | CLONE_CHILD_CLEARTID
                           | 0);

  TLS_DEFINE_INIT_TP (tp, pd);

  // here the __clone system call is issued
  if (__glibc_unlikely (ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS,
                                    clone_flags, pd, &pd->tid, tp, &pd->tid)
                        == -1))
    return errno;

  /* It's started now, so if we fail below, we'll have to cancel it
     and let it clean itself up.  */
  *thread_ran = true;

  /* Now we have the possibility to set scheduling parameters etc.  */
  if (attr != NULL)
    {
      INTERNAL_SYSCALL_DECL (err);
      int res;

      /* Set the affinity mask if necessary.  */
      if (attr->cpuset != NULL)
        {
          assert (*stopped_start);

          res = INTERNAL_SYSCALL (sched_setaffinity, err, 3, pd->tid,
                                  attr->cpusetsize, attr->cpuset);

          if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (res, err)))
          err_out:
            {
              /* The operation failed.  We have to kill the thread.
                 We let the normal cancellation mechanism do the work.  */

              pid_t pid = __getpid ();
              INTERNAL_SYSCALL_DECL (err2);
              (void) INTERNAL_SYSCALL_CALL (tgkill, err2, pid, pd->tid,
                                            SIGCANCEL);

              return INTERNAL_SYSCALL_ERRNO (res, err);
            }
        }

      /* Set the scheduling parameters.  */
      if ((attr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0)
        {
          assert (*stopped_start);

          res = INTERNAL_SYSCALL (sched_setscheduler, err, 3, pd->tid,
                                  pd->schedpolicy, &pd->schedparam);

          if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (res, err)))
            goto err_out;
        }
    }

  return 0;
}

  As you can see, it ultimately ends up calling __clone(). __clone is the user-level entry to the Linux clone system call, and with the flags shown above it creates a thread here.
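  To make those flags more tangible, here is a hedged sketch that calls the glibc clone() wrapper directly with a thread-like flag set. It only illustrates what NPTL does under the hood and is not how threads should be created in practice; the flag set is deliberately smaller than the one create_thread() passes (CLONE_SETTLS and the TID bookkeeping flags are omitted).

// clone_demo.c -- compile with: gcc clone_demo.c
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define STACK_SIZE (1024 * 1024)

static int child_fn(void *arg)
{
    // runs in the same address space as the caller, like a thread
    printf("clone child: pid=%d, arg=%s\n", getpid(), (char *) arg);
    return 0;
}

int main(void)
{
    char *stack = malloc(STACK_SIZE);
    if (!stack)
        return 1;

    // CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD is the
    // "thread-like" core; passing just SIGCHLD instead would give a fork-like child.
    int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD;

    // the stack grows down on x86, so pass the top of the allocated region
    int tid = clone(child_fn, stack + STACK_SIZE, flags, "hello");
    if (tid == -1) {
        perror("clone");
        return 1;
    }

    sleep(1);   // crude: a CLONE_THREAD child cannot be reaped with waitpid()
    free(stack);
    return 0;
}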

4. How the kernel creates a process: do_fork

  The fork() and pthread_create() above are really just user-level wrappers; at the operating-system level, both process and thread creation go through do_fork().

// kernel/fork.c
/* For compatibility with architectures that call do_fork directly rather than
 * using the syscall entry points below. */
long do_fork(unsigned long clone_flags,
	     unsigned long stack_start,
	     unsigned long stack_size,
	     int __user *parent_tidptr,
	     int __user *child_tidptr)
{
	// call the single unified creation routine
	return _do_fork(clone_flags, stack_start, stack_size,
			parent_tidptr, child_tidptr, 0);
}

/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long _do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr,
	      unsigned long tls)
{
	struct completion vfork;
	struct pid *pid;
	struct task_struct *p;
	int trace = 0;
	long nr;

	/*
	 * Determine whether and which event to report to ptracer. When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	// copy the current process's context
	p = copy_process(clone_flags, stack_start, stack_size,
			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
	add_latent_entropy();

	if (IS_ERR(p))
		return PTR_ERR(p);

	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	trace_sched_process_fork(current, p);

	// get the pid of the newly created process
	pid = get_task_pid(p, PIDTYPE_PID);
	nr = pid_vnr(pid);

	if (clone_flags & CLONE_PARENT_SETTID)
		put_user(nr, parent_tidptr);

	if (clone_flags & CLONE_VFORK) {
		p->vfork_done = &vfork;
		init_completion(&vfork);
		get_task_struct(p);
	}

	// put the new task on a run queue so it can be scheduled later
	wake_up_new_task(p);

	/* forking complete and child started to run, tell ptracer */
	if (unlikely(trace))
		ptrace_event_pid(trace, pid);

	if (clone_flags & CLONE_VFORK) {
		if (!wait_for_vfork_done(p, &vfork))
			ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
	}

	// pid reference management
	put_pid(pid);
	return nr;
}

  The skeleton is simple and clear: copy the current process, wake the copy so it can be scheduled, and return its pid. So the key points are how the process is copied and how it is woken; the clone flags also matter a great deal, since they decide how the new task is created.
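  For reference, the syscall entry points mentioned in the comment above live in the same kernel/fork.c and differ only in the flags they pass to _do_fork(). Roughly (simplified from the 4.x sources; the architecture-specific #ifdef variants are omitted):

// kernel/fork.c (simplified)
SYSCALL_DEFINE0(fork)
{
	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
}

SYSCALL_DEFINE0(vfork)
{
	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL, 0);
}

SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 unsigned long, tls)
{
	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
}

  So a plain fork() passes little more than SIGCHLD (copy everything, signal the parent on exit), while pthread_create()'s clone call passes the CLONE_VM | CLONE_FILES | CLONE_SIGHAND | ... sharing flags seen earlier; the same _do_fork() serves both.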

4.1 What exactly gets copied for the new process?

/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
*
* It copies the registers, and all the appropriate
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
static __latent_entropy struct task_struct *copy_process(
unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
int trace,
unsigned long tls,
int node)
{
int retval;
struct task_struct *p; /*
* Don't allow sharing the root directory with processes in a different
* namespace
*/
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL); if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
return ERR_PTR(-EINVAL); /*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
*/
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
return ERR_PTR(-EINVAL); /*
* Shared signal handlers imply shared VM. By way of the above,
* thread groups also imply shared VM. Blocking this case allows
* for various simplifications in other code.
*/
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL); /*
* Siblings of global init remain as zombies on exit since they are
* not reaped by their parent (swapper). To solve this and to avoid
* multi-rooted process trees, prevent global and container-inits
* from creating siblings.
*/
if ((clone_flags & CLONE_PARENT) &&
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL); /*
* If the new process will be in a different pid or user namespace
* do not allow it to share a thread group with the forking task.
*/
if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
(task_active_pid_ns(current) !=
current->nsproxy->pid_ns_for_children))
return ERR_PTR(-EINVAL);
} retval = -ENOMEM;
// duplicate the current task's task_struct
p = dup_task_struct(current, node);
if (!p)
goto fork_out; /*
* This _must_ happen before we call free_task(), i.e. before we jump
* to any of the bad_fork_* labels. This is to avoid freeing
* p->set_child_tid which is (ab)used as a kthread's data pointer for
* kernel threads (PF_KTHREAD).
*/
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; ftrace_graph_init_task(p); rt_mutex_init_task(p); #ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
if (atomic_read(&p->real_cred->user->processes) >=
task_rlimit(p, RLIMIT_NPROC)) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
}
current->flags &= ~PF_NPROC_EXCEEDED; retval = copy_creds(p, clone_flags);
if (retval < 0)
goto bad_fork_free; /*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
* to stop root fork bombs.
*/
retval = -EAGAIN;
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
p->utimescaled = p->stimescaled = 0;
#endif
prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqcount_init(&p->vtime.seqcount);
p->vtime.starttime = 0;
p->vtime.state = VTIME_INACTIVE;
#endif #if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif p->default_timer_slack_ns = current->timer_slack_ns; task_io_accounting_init(&p->ioac);
acct_clear_integrals(p); posix_cpu_timers_init(p); p->start_time = ktime_get_ns();
p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
goto bad_fork_cleanup_threadgroup_lock;
}
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
p->hardirqs_enabled = 0;
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
p->hardirq_disable_event = 0;
p->softirqs_enabled = 1;
p->softirq_enable_ip = _THIS_IP_;
p->softirq_enable_event = 0;
p->softirq_disable_ip = 0;
p->softirq_disable_event = 0;
p->hardirq_context = 0;
p->softirq_context = 0;
#endif p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
p->lockdep_recursion = 0;
lockdep_init_task(p);
#endif #ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
#endif
// copy the remaining independent pieces of state into the new task
/* Perform scheduler related setup. Assign this task to a CPU. */
// scheduler setup and CPU assignment
retval = sched_fork(clone_flags, p);
if (retval)
goto bad_fork_cleanup_policy; retval = perf_event_init_task(p);
if (retval)
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
/* copy all the process information */
shm_init_task(p);
retval = security_task_alloc(p, clone_flags);
if (retval)
goto bad_fork_cleanup_audit;
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
// copy the open files, which in practice means copying the fd table
retval = copy_files(clone_flags, p);
if (retval)
goto bad_fork_cleanup_semundo;
// copy the fs_struct (filesystem information)
retval = copy_fs(clone_flags, p);
if (retval)
goto bad_fork_cleanup_files;
retval = copy_sighand(clone_flags, p);
if (retval)
goto bad_fork_cleanup_fs;
// copy the signal state; not needed when creating a thread
retval = copy_signal(clone_flags, p);
if (retval)
goto bad_fork_cleanup_sighand;
// copy the memory descriptor; heavyweight, but not actually copied when creating a thread
retval = copy_mm(clone_flags, p);
if (retval)
goto bad_fork_cleanup_signal;
// copy the namespaces: pid, uts, ipc, net, cgroup
retval = copy_namespaces(clone_flags, p);
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
// copy thread-local storage (TLS)
retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
if (retval)
goto bad_fork_cleanup_io; if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
}
} #ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
p->robust_list = NULL;
#ifdef CONFIG_COMPAT
p->compat_robust_list = NULL;
#endif
INIT_LIST_HEAD(&p->pi_state_list);
p->pi_state_cache = NULL;
#endif
/*
* sigaltstack should be cleared when sharing the same VM
*/
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
sas_ss_reset(p); /*
* Syscall tracing and stepping should be turned off in the
* child regardless of CLONE_PTRACE.
*/
user_disable_single_step(p);
clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
clear_all_latency_tracing(p); /* ok, now we should be set up.. */
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) {
p->exit_signal = -1;
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
if (clone_flags & CLONE_PARENT)
p->exit_signal = current->group_leader->exit_signal;
else
p->exit_signal = (clone_flags & CSIGNAL);
p->group_leader = p;
p->tgid = p->pid;
} p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
p->dirty_paused_when = 0; p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL; cgroup_threadgroup_change_begin(current);
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted the the new process's css_set can be changed
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
retval = cgroup_can_fork(p);
if (retval)
goto bad_fork_free_pid; /*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
write_lock_irq(&tasklist_lock); /* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
} klp_copy_process(p); spin_lock(&current->sighand->siglock); /*
* Copy seccomp details explicitly here, in case they were changed
* before holding sighand lock.
*/
copy_seccomp(p); rseq_fork(p, clone_flags); /*
* Process group and session signals need to be delivered to just the
* parent before the fork or both the parent and the child after the
* fork. Restart if a signal comes in before we add the new process to
* it's process group.
* A fatal signal pending means that current will exit, so the new
* thread can't slip out of an OOM kill (or normal SIGKILL).
*/
recalc_sigpending();
if (signal_pending(current)) {
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
} if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
init_task_pid(p, PIDTYPE_SID, task_session(current)); if (is_child_reaper(pid)) {
ns_of_pid(pid)->child_reaper = p;
p->signal->flags |= SIGNAL_UNKILLABLE;
} p->signal->leader_pid = pid;
p->signal->tty = tty_kref_get(current->signal->tty);
/*
* Inherit has_child_subreaper flag under the same
* tasklist_lock with adding child to the process tree
* for propagate_has_child_subreaper optimization.
*/
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
p->real_parent->signal->is_child_subreaper;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_PGID);
attach_pid(p, PIDTYPE_SID);
__this_cpu_inc(process_counts);
} else {
current->signal->nr_threads++;
atomic_inc(&current->signal->live);
atomic_inc(&current->signal->sigcnt);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
attach_pid(p, PIDTYPE_PID);
nr_threads++;
} total_forks++;
spin_unlock(&current->sighand->siglock);
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock); proc_fork_connector(p);
cgroup_post_fork(p);
cgroup_threadgroup_change_end(current);
perf_event_fork(p); trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags); return p; bad_fork_cancel_cgroup:
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p);
bad_fork_free_pid:
cgroup_threadgroup_change_end(current);
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_thread:
exit_thread(p);
bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
exit_task_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm)
mmput(p->mm);
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
exit_fs(p); /* blocking */
bad_fork_cleanup_files:
exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
exit_sem(p);
bad_fork_cleanup_security:
security_task_free(p);
bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_perf:
perf_event_free_task(p);
bad_fork_cleanup_policy:
lockdep_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
delayacct_tsk_free(p);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
exit_creds(p);
bad_fork_free:
p->state = TASK_DEAD;
put_task_stack(p);
free_task(p);
fork_out:
return ERR_PTR(retval);
}

  As you can see, creating a new process is a heavyweight affair: a great deal of information is copied, which certainly costs performance and resources, so don't casually create processes you don't need; they are expensive. Below we walk through a few of the simpler copy steps to see what they actually do.
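  If you want to feel the cost difference yourself, a rough micro-benchmark along these lines (a sketch only; absolute numbers vary by machine, and because of copy-on-write part of fork's real cost only shows up later, at the first write) typically shows fork() to be noticeably more expensive than pthread_create():

// spawn_cost.c -- compile with: gcc -O2 spawn_cost.c -pthread
#include <pthread.h>
#include <stdio.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#define N 2000

static void *noop(void *arg) { return NULL; }

static double now_sec(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec / 1e9;
}

int main(void)
{
    double t0 = now_sec();
    for (int i = 0; i < N; i++) {
        pid_t pid = fork();
        if (pid == 0)
            _exit(0);               // child exits immediately
        waitpid(pid, NULL, 0);
    }
    double t1 = now_sec();

    for (int i = 0; i < N; i++) {
        pthread_t t;
        pthread_create(&t, NULL, noop, NULL);
        pthread_join(t, NULL);
    }
    double t2 = now_sec();

    printf("fork+wait      : %.1f us per iteration\n", (t1 - t0) / N * 1e6);
    printf("pthread_create : %.1f us per iteration\n", (t2 - t1) / N * 1e6);
    return 0;
}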

// duplicate the outer shell of the task structure
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
	struct task_struct *tsk;
	unsigned long *stack;
	struct vm_struct *stack_vm_area;
	int err;

	if (node == NUMA_NO_NODE)
		node = tsk_fork_get_node(orig);
	tsk = alloc_task_struct_node(node);
	if (!tsk)
		return NULL;

	stack = alloc_thread_stack_node(tsk, node);
	if (!stack)
		goto free_tsk;

	stack_vm_area = task_stack_vm_area(tsk);

	err = arch_dup_task_struct(tsk, orig);

	/*
	 * arch_dup_task_struct() clobbers the stack-related fields. Make
	 * sure they're properly initialized before using any stack-related
	 * functions again.
	 */
	tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
	tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
	atomic_set(&tsk->stack_refcount, 1);
#endif

	if (err)
		goto free_stack;

#ifdef CONFIG_SECCOMP
	/*
	 * We must handle setting up seccomp filters once we're under
	 * the sighand lock in case orig has changed between now and
	 * then. Until then, filter must be NULL to avoid messing up
	 * the usage counts on the error path calling free_task.
	 */
	tsk->seccomp.filter = NULL;
#endif

	setup_thread_stack(tsk, orig);
	clear_user_return_notifier(tsk);
	clear_tsk_need_resched(tsk);
	set_task_stack_end_magic(tsk);

#ifdef CONFIG_STACKPROTECTOR
	tsk->stack_canary = get_random_canary();
#endif

	/*
	 * One for us, one for whoever does the "release_task()" (usually
	 * parent)
	 */
	atomic_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
	tsk->btrace_seq = 0;
#endif
	tsk->splice_pipe = NULL;
	tsk->task_frag.page = NULL;
	tsk->wake_q.next = NULL;

	account_kernel_stack(tsk, 1);

	kcov_task_init(tsk);

#ifdef CONFIG_FAULT_INJECTION
	tsk->fail_nth = 0;
#endif

	return tsk;

free_stack:
	free_thread_stack(tsk);
free_tsk:
	free_task_struct(tsk);
	return NULL;
}

// copy the memory space
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
	struct mm_struct *mm, *oldmm;
	int retval;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
#endif

	tsk->mm = NULL;
	tsk->active_mm = NULL;

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal a active VM for that..
	 */
	oldmm = current->mm;
	if (!oldmm)
		return 0;

	/* initialize the new vmacache entries */
	vmacache_flush(tsk);

	// in the common (thread) case the existing memory information is reused directly
	if (clone_flags & CLONE_VM) {
		// just bump the mm's usage count and reuse the existing pointer
		mmget(oldmm);
		mm = oldmm;
		goto good_mm;
	}

	retval = -ENOMEM;
	// the heavyweight path: actually duplicate the memory descriptor
	mm = dup_mm(tsk);
	if (!mm)
		goto fail_nomem;

good_mm:
	tsk->mm = mm;
	tsk->active_mm = mm;
	return 0;

fail_nomem:
	return retval;
}

/*
 * Allocate a new mm structure and copy contents from the
 * mm structure of the passed in task structure.
 */
static struct mm_struct *dup_mm(struct task_struct *tsk)
{
	struct mm_struct *mm, *oldmm = current->mm;
	int err;

	mm = allocate_mm();
	if (!mm)
		goto fail_nomem;

	memcpy(mm, oldmm, sizeof(*mm));

	if (!mm_init(mm, tsk, mm->user_ns))
		goto fail_nomem;

	err = dup_mmap(mm, oldmm);
	if (err)
		goto free_pt;

	mm->hiwater_rss = get_mm_rss(mm);
	mm->hiwater_vm = mm->total_vm;

	if (mm->binfmt && !try_module_get(mm->binfmt->module))
		goto free_pt;

	return mm;

free_pt:
	/* don't put binfmt in mmput, we haven't got module yet */
	mm->binfmt = NULL;
	mmput(mm);

fail_nomem:
	return NULL;
}

  In the copying process each function has its own specific job, which reflects the single-responsibility principle from design patterns. Copying the process structure just duplicates the outermost task_struct, while copying memory walks through the memory regions and duplicates them one by one.
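  A small sketch that makes the copy_mm() distinction observable from user space: after fork() (no CLONE_VM, so dup_mm() runs) a write in the child is invisible to the parent, whereas a write from a thread (CLONE_VM, the mm is simply reused) is visible.

// mm_share_demo.c -- compile with: gcc mm_share_demo.c -pthread
#include <pthread.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

static int value = 0;

static void *thread_fn(void *arg)
{
    value = 2;                  // same mm: the main thread will see this
    return NULL;
}

int main(void)
{
    pid_t pid = fork();
    if (pid == 0) {
        value = 1;              // child writes into its own (copy-on-write) mm
        _exit(0);
    }
    waitpid(pid, NULL, 0);
    printf("after fork child wrote 1:  value = %d\n", value);   // still 0

    pthread_t t;
    pthread_create(&t, NULL, thread_fn, NULL);
    pthread_join(t, NULL);
    printf("after thread wrote 2:      value = %d\n", value);   // now 2
    return 0;
}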

4.2 Waking up the newly created task

  Once the process has been created, it naturally has to be handed over to the scheduler. In theory, all that is needed is to put the new task on a run queue, after which fork() no longer has to care about it. Is that actually how it works?

// kernel/sched/core.c
// put the newly created task on a run queue and mark it runnable
/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
	p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
	/*
	 * Fork balancing, do it here and not earlier because:
	 *  - cpus_allowed can change in the fork path
	 *  - any previously selected CPU might disappear through hotplug
	 *
	 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
	 * as we're not fully set-up yet.
	 */
	p->recent_used_cpu = task_cpu(p);
	__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
	rq = __task_rq_lock(p, &rf);
	update_rq_clock(rq);
	post_init_entity_util_avg(&p->se);

	activate_task(rq, p, ENQUEUE_NOCLOCK);
	p->on_rq = TASK_ON_RQ_QUEUED;
	trace_sched_wakeup_new(p);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken) {
		/*
		 * Nothing relies on rq->lock after this, so its fine to
		 * drop it.
		 */
		rq_unpin_lock(rq, &rf);
		p->sched_class->task_woken(rq, p);
		rq_repin_lock(rq, &rf);
	}
#endif
	task_rq_unlock(rq, p, &rf);
}

// add the new task to the run queue
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (!(flags & ENQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & ENQUEUE_RESTORE))
		sched_info_queued(rq, p);

	p->sched_class->enqueue_task(rq, p, flags);
}
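  So the answer to the question above is yes: wake_up_new_task() selects a CPU, enqueues the task through its scheduling class, and from then on the ordinary scheduler takes over; fork() never has to touch it again. The last line, p->sched_class->enqueue_task(rq, p, flags), is a function-pointer dispatch: each scheduling class (CFS, real-time, deadline) supplies its own implementation. A toy, self-contained sketch of that pattern (illustrative only, these are not the kernel's real definitions):

// sched_class_toy.c -- a toy model of the sched_class dispatch used by enqueue_task() above
#include <stdio.h>

struct task { const char *name; };
struct rq { int nr_running; };

struct sched_class {
	const char *name;
	void (*enqueue_task)(struct rq *rq, struct task *p);
};

static void enqueue_task_fair(struct rq *rq, struct task *p)
{
	rq->nr_running++;
	printf("CFS enqueues %s\n", p->name);
}

static void enqueue_task_rt(struct rq *rq, struct task *p)
{
	rq->nr_running++;
	printf("RT enqueues %s\n", p->name);
}

static const struct sched_class fair_class = { "fair", enqueue_task_fair };
static const struct sched_class rt_class   = { "rt",   enqueue_task_rt   };

int main(void)
{
	struct rq rq = { 0 };
	struct task a = { "new-forked-task" };

	// the core code only ever calls through the class's table,
	// just like p->sched_class->enqueue_task(rq, p, flags) above
	const struct sched_class *cls = &fair_class;
	cls->enqueue_task(&rq, &a);

	cls = &rt_class;
	cls->enqueue_task(&rq, &a);
	return 0;
}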

  

4.3 Is put_pid managing the pid?

  Finally, let's see what put_pid() actually does. It looks as though it might be registering the pid in some global table for central management, but in fact it is not: it is reference-count bookkeeping.

// kernel/pid.c
// drop a reference on the struct pid (and possibly on its pid namespace)
void put_pid(struct pid *pid)
{
	struct pid_namespace *ns;

	if (!pid)
		return;

	ns = pid->numbers[pid->level].ns;
	if ((atomic_read(&pid->count) == 1) ||
	     atomic_dec_and_test(&pid->count)) {
		kmem_cache_free(ns->pid_cachep, pid);
		put_pid_ns(ns);
	}
}
EXPORT_SYMBOL_GPL(put_pid);

// kernel/pid_namespace.c
void put_pid_ns(struct pid_namespace *ns)
{
	struct pid_namespace *parent;

	// walk up the namespace chain toward the init pid namespace, dropping one reference per level
	while (ns != &init_pid_ns) {
		parent = ns->parent;
		if (!kref_put(&ns->kref, free_pid_ns))
			break;
		ns = parent;
	}
}
EXPORT_SYMBOL_GPL(put_pid_ns);

static void free_pid_ns(struct kref *kref)
{
	struct pid_namespace *ns;

	ns = container_of(kref, struct pid_namespace, kref);
	destroy_pid_namespace(ns);
}

  

  Creating processes and threads involves a great many intricate operations, and every detail rewards closer study. Here we have only glanced at a few directions, such as how memory is copied, which is enough to get a rough feel for the difference between processes and threads; hopefully it clears up some of the mystery.
