Linux process management: process creation
The kernel describes every process with a task_struct; the fields relevant to process creation are:

struct task_struct {
    volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
    void *stack;
    atomic_t usage;
    unsigned int flags;     /* per process flags, defined below */
    unsigned int ptrace;
    int lock_depth;         /* BKL lock depth */
    ……
    ……
};
A task's thread_info sits at the bottom of its kernel stack, so it can be located from the stack pointer with the GET_THREAD_INFO() macro:

#define GET_THREAD_INFO(reg) \
    movl $-THREAD_SIZE, reg; \
    andl %esp, reg

THREAD_SIZE is defined as follows:

#ifdef CONFIG_4KSTACKS
#define THREAD_SIZE     (4096)
#else
#define THREAD_SIZE     (8192)
#endif
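The trick is that the kernel stack and the thread_info share one THREAD_SIZE-aligned block, so ANDing the stack pointer with -THREAD_SIZE (i.e. ~(THREAD_SIZE - 1)) yields the thread_info address. The user-space sketch below only reproduces that arithmetic; the esp value in it is made up purely for illustration:

#include <stdio.h>
#include <stdint.h>

#define THREAD_SIZE 8192UL   /* two-page kernel stack, assuming CONFIG_4KSTACKS is off */

int main(void)
{
    /* pretend this is the kernel-mode esp at some instant (illustrative value) */
    uintptr_t esp = 0xc03f9e40UL;
    /* AND with -THREAD_SIZE, i.e. ~(THREAD_SIZE-1): the low end of the stack area,
     * which is where thread_info lives */
    uintptr_t thread_info = esp & ~(THREAD_SIZE - 1);

    printf("esp         = %#lx\n", (unsigned long)esp);
    printf("thread_info = %#lx\n", (unsigned long)thread_info);
    return 0;
}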
The first process, init_task, is defined statically with the INIT_TASK() macro:

struct task_struct init_task = INIT_TASK(init_task);

#define INIT_TASK(tsk) \
{ \
    .state      = 0, \
    .stack      = &init_thread_info, \
    .usage      = ATOMIC_INIT(2), \
    ……
    ……
    .dirties    = INIT_PROP_LOCAL_SINGLE(dirties), \
    INIT_TRACE_IRQFLAGS \
    INIT_LOCKDEP \
}
Process creation starts from the fork, clone and vfork system calls; on i386 their kernel entry points are:

asmlinkage int sys_fork(struct pt_regs regs)
{
    return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}

asmlinkage int sys_clone(struct pt_regs regs)
{
    unsigned long clone_flags;
    unsigned long newsp;
    int __user *parent_tidptr, *child_tidptr;

    clone_flags = regs.ebx;
    newsp = regs.ecx;
    parent_tidptr = (int __user *)regs.edx;
    child_tidptr = (int __user *)regs.edi;
    if (!newsp)
        newsp = regs.esp;
    return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
}

asmlinkage int sys_vfork(struct pt_regs regs)
{
    return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}
As the code above shows, all three system calls end up in the same interface, do_fork(); the only difference is the set of flags they pass. The flags used here are defined as follows:
#define SIGCHLD     17
#define CLONE_VM    0x00000100  /* set if VM shared between processes */
#define CLONE_VFORK 0x00004000  /* set if the parent wants the child to wake it up on mm_release */
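To see what these flags mean from user space, here is a small demonstration (my own sketch, not kernel code) that calls the glibc clone() wrapper twice: once with only SIGCHLD, which behaves like fork(), and once with CLONE_VM | SIGCHLD, where parent and child share one address space. STACK_SIZE, child_fn and run are names invented for the example:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define STACK_SIZE (1024 * 1024)

static int counter = 0;

static int child_fn(void *arg)
{
    counter = 42;               /* write to the (possibly shared) address space */
    return 0;
}

static void run(int flags, const char *name)
{
    char *stack = malloc(STACK_SIZE);
    if (!stack) { perror("malloc"); exit(1); }

    counter = 0;
    /* the stack grows down on x86, so pass the high end of the buffer */
    pid_t pid = clone(child_fn, stack + STACK_SIZE, flags, NULL);
    if (pid < 0) { perror("clone"); exit(1); }
    waitpid(pid, NULL, 0);

    printf("%-24s counter seen by parent = %d\n", name, counter);
    free(stack);
}

int main(void)
{
    run(SIGCHLD, "fork-like (no CLONE_VM):");   /* parent keeps counter == 0 (private copy) */
    run(CLONE_VM | SIGCHLD, "CLONE_VM:");       /* parent sees counter == 42 (shared mm) */
    return 0;
}

With the entry points and flags in hand, the real work happens in do_fork():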
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          struct pt_regs *regs,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    /* allocate a new pid */
    struct pid *pid = alloc_pid();
    long nr;

    if (!pid)
        return -EAGAIN;
    nr = pid->nr;
    /* if the current process is being traced and the clone flags ask for the
       child to be traced as well, set CLONE_PTRACE */
    if (unlikely(current->ptrace)) {
        trace = fork_traceflag (clone_flags);
        if (trace)
            clone_flags |= CLONE_PTRACE;
    }

    /* copy the parent's information into the new child */
    p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
    if (!IS_ERR(p)) {
        struct completion vfork;

        /* if CLONE_VFORK is set, point vfork_done at the completion and initialise it */
        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }

        /* if the child is traced, or was asked to start stopped, queue a SIGSTOP
           for it.  The child is not running yet and cannot handle the signal,
           so TIF_SIGPENDING is set as well. */
        if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
            /*
             * We'll start up with an immediate SIGSTOP.
             */
            sigaddset(&p->pending.signal, SIGSTOP);
            set_tsk_thread_flag(p, TIF_SIGPENDING);
        }

        /* if CLONE_STOPPED is not set, make the child runnable and let it wait
           for the next scheduling opportunity; otherwise put it into TASK_STOPPED */
        if (!(clone_flags & CLONE_STOPPED))
            wake_up_new_task(p, clone_flags);
        else
            p->state = TASK_STOPPED;

        /* if tracing was requested for the child, notify the tracer */
        if (unlikely (trace)) {
            current->ptrace_message = nr;
            ptrace_notify ((trace << 8) | SIGTRAP);
        }

        /* if CLONE_VFORK is set, put the current (parent) process to sleep */
        if (clone_flags & CLONE_VFORK) {
            freezer_do_not_count();
            wait_for_completion(&vfork);
            freezer_count();
            if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
                current->ptrace_message = nr;
                ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
            }
        }
    } else {
        /* copying the parent's information failed: release the pid we allocated */
        free_pid(pid);
        nr = PTR_ERR(p);
    }
    return nr;
}

We analysed the purpose of the vfork flag at the beginning; here, note how CLONE_VFORK is handled:

long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          struct pt_regs *regs,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    ……
    ……
    /*
    static inline void init_completion(struct completion *x)
    {
        // done == 0 means the child has not yet woken the parent
        x->done = 0;
        // initialise the wait queue
        init_waitqueue_head(&x->wait);
    }
    */
    if (clone_flags & CLONE_VFORK) {
        p->vfork_done = &vfork;
        init_completion(&vfork);
    }
    ……
    ……
    /* if CLONE_VFORK is set, put the current (parent) process to sleep */
    if (clone_flags & CLONE_VFORK) {
        freezer_do_not_count();
        wait_for_completion(&vfork);
        freezer_count();
        if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
            current->ptrace_message = nr;
            ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
        }
    }
    ……
}

Tracing into wait_for_completion():

void fastcall __sched wait_for_completion(struct completion *x)
{
    might_sleep();

    spin_lock_irq(&x->wait.lock);
    if (!x->done) {
        /* set up a wait-queue entry for the current process */
        DECLARE_WAITQUEUE(wait, current);

        wait.flags |= WQ_FLAG_EXCLUSIVE;
        /* add it to the completion's wait queue */
        __add_wait_queue_tail(&x->wait, &wait);
        do {
            /* set the process state to TASK_UNINTERRUPTIBLE */
            __set_current_state(TASK_UNINTERRUPTIBLE);
            spin_unlock_irq(&x->wait.lock);
            /* reschedule: normally the parent leaves the CPU here and other
               processes run, until the child wakes it up */
            schedule();
            spin_lock_irq(&x->wait.lock);
        } while (!x->done);
        /* loop until x->done is set, which guards against spurious wakeups;
           then remove ourselves from the wait queue */
        __remove_wait_queue(&x->wait, &wait);
    }
    x->done--;
    spin_unlock_irq(&x->wait.lock);
}
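The effect of this parent-side sleep is easy to observe from user space with vfork(): the parent stays blocked in wait_for_completion() until the child calls exec or exits, at which point mm_release() completes vfork_done. A minimal demonstration (my own example, not from the kernel source):

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    printf("parent: calling vfork(), about to sleep on the completion\n");
    fflush(stdout);

    pid_t pid = vfork();
    if (pid < 0) {
        perror("vfork");
        exit(1);
    }
    if (pid == 0) {
        /* child: runs first while the parent is blocked in wait_for_completion();
         * with vfork() the child may only call _exit() or exec*() */
        execlp("echo", "echo", "child: running before the parent wakes up", (char *)NULL);
        _exit(127);
    }

    /* we only get here after the child has exec'ed or exited */
    printf("parent: woken up, child pid = %d\n", (int)pid);
    waitpid(pid, NULL, 0);
    return 0;
}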
Back to do_fork(): its core is copy_process(), which deserves a closer look:

static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    struct pt_regs *regs,
                    unsigned long stack_size,
                    int __user *parent_tidptr,
                    int __user *child_tidptr,
                    struct pid *pid)
{
    int retval;
    struct task_struct *p = NULL;

    /* validate clone_flags:
       CLONE_NEWNS and CLONE_FS must not be set at the same time */
    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

    /* if CLONE_THREAD is set, CLONE_SIGHAND must be set as well */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);

    /* if CLONE_SIGHAND is set, CLONE_VM must be set as well */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;

    retval = -ENOMEM;
    /* duplicate the parent's task_struct */
    p = dup_task_struct(current);
    if (!p)
        goto fork_out;

    rt_mutex_init_task(p);

#ifdef CONFIG_TRACE_IRQFLAGS
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
    retval = -EAGAIN;
    /* fail if the user already owns more processes than its resource limit allows */
    if (atomic_read(&p->user->processes) >=
            p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
        if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
            p->user != current->nsproxy->user_ns->root_user)
            goto bad_fork_free;
    }

    /* update the per-user reference and process counts */
    atomic_inc(&p->user->__count);
    atomic_inc(&p->user->processes);
    get_group_info(p->group_info);

    /* is the number of threads in the system already at the allowed maximum? */
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    /* take references on the modules backing the execution domain and binary format */
    if (!try_module_get(task_thread_info(p)->exec_domain->module))
        goto bad_fork_cleanup_count;

    if (p->binfmt && !try_module_get(p->binfmt->module))
        goto bad_fork_cleanup_put_domain;

    /* the child is still being set up and has not called execve() yet */
    p->did_exec = 0;
    delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
    /* copy all of the parent's flags except PF_SUPERPRIV (superuser privileges)
       and set PF_FORKNOEXEC to mark the child as forked but not yet exec'ed */
    copy_flags(clone_flags, p);
    /* assign the child's pid */
    p->pid = pid_nr(pid);
    retval = -EFAULT;
    if (clone_flags & CLONE_PARENT_SETTID)
        if (put_user(p->pid, parent_tidptr))
            goto bad_fork_cleanup_delays_binfmt;

    /* initialise the child's list heads */
    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);

    /* the parent's TIF_SIGPENDING flag (pending, unhandled signals) was copied
       into the child; the child does not need it, so clear it */
    clear_tsk_thread_flag(p, TIF_SIGPENDING);
    init_sigpending(&p->pending);

    /* initialise the child's time accounting */
    p->utime = cputime_zero;
    p->stime = cputime_zero;
    p->prev_utime = cputime_zero;
    ……
    ……
    /* by default tgid == pid; a CLONE_THREAD child joins the parent's thread group */
    p->tgid = p->pid;
    if (clone_flags & CLONE_THREAD)
        p->tgid = current->tgid;

    /* copy the parent's other resources: open files, signal handlers, the VM, and so on */
    if ((retval = security_task_alloc(p)))
        goto bad_fork_cleanup_policy;
    if ((retval = audit_alloc(p)))
        goto bad_fork_cleanup_security;
    /* copy all the process information */
    if ((retval = copy_semundo(clone_flags, p)))
        goto bad_fork_cleanup_audit;
    if ((retval = copy_files(clone_flags, p)))
        goto bad_fork_cleanup_semundo;
    if ((retval = copy_fs(clone_flags, p)))
        goto bad_fork_cleanup_files;
    if ((retval = copy_sighand(clone_flags, p)))
        goto bad_fork_cleanup_fs;
    if ((retval = copy_signal(clone_flags, p)))
        goto bad_fork_cleanup_sighand;
    if ((retval = copy_mm(clone_flags, p)))
        goto bad_fork_cleanup_signal;
    if ((retval = copy_keys(clone_flags, p)))
        goto bad_fork_cleanup_mm;
    if ((retval = copy_namespaces(clone_flags, p)))
        goto bad_fork_cleanup_keys;
    retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
    if (retval)
        goto bad_fork_cleanup_namespaces;

    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
    p->robust_list = NULL;
#ifdef CONFIG_COMPAT
    p->compat_robust_list = NULL;
#endif
    INIT_LIST_HEAD(&p->pi_state_list);
    p->pi_state_cache = NULL;

    /*
     * sigaltstack should be cleared when sharing the same VM
     */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        p->sas_ss_sp = p->sas_ss_size = 0;

    /*
     * Syscall tracing should be turned off in the child regardless
     * of CLONE_PTRACE.
     */
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif

    /* Our parent execution domain becomes current domain
       These must match for thread signalling to apply */
    p->parent_exec_id = p->self_exec_id;

    /* ok, now we should be set up.. */
    /* exit_signal: the signal sent to the parent when the child exits */
    p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
    /* pdeath_signal: the signal this process will receive when its parent dies */
    p->pdeath_signal = 0;
    p->exit_state = 0;
    ……
    ……
    if (likely(p->pid)) {
        add_parent(p);
        if (unlikely(p->ptrace & PT_PTRACED))
            __ptrace_link(p, current->parent);

        if (thread_group_leader(p)) {
            p->signal->tty = current->signal->tty;
            p->signal->pgrp = process_group(current);
            set_signal_session(p->signal, process_session(current));
            attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
            attach_pid(p, PIDTYPE_SID, task_session(current));
            list_add_tail_rcu(&p->tasks, &init_task.tasks);
            __get_cpu_var(process_counts)++;
        }
        attach_pid(p, PIDTYPE_PID, pid);
        /* one more thread in the system */
        nr_threads++;
    }

    /* count one more successful fork */
    total_forks++;
    spin_unlock(&current->sighand->siglock);
    write_unlock_irq(&tasklist_lock);
    proc_fork_connector(p);
    return p;
    ……
    ……
}
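One detail worth demonstrating is the tgid assignment above: a child created with CLONE_THREAD joins the caller's thread group, so it keeps the creator's tgid (what getpid() returns) while getting a pid of its own (gettid()). The sketch below uses pthread_create(), which internally calls clone() with CLONE_THREAD | CLONE_VM | CLONE_SIGHAND among other flags; gettid_() is a small helper defined here because older glibc versions have no gettid() wrapper. Compile with -pthread:

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* call the gettid system call directly */
static pid_t gettid_(void)
{
    return (pid_t)syscall(SYS_gettid);
}

static void *thread_fn(void *arg)
{
    /* a CLONE_THREAD child keeps the creator's tgid (getpid()) but has its own pid (gettid()) */
    printf("thread: getpid() = %d, gettid() = %d\n", (int)getpid(), (int)gettid_());
    return NULL;
}

int main(void)
{
    pthread_t t;

    printf("main  : getpid() = %d, gettid() = %d\n", (int)getpid(), (int)gettid_());
    pthread_create(&t, NULL, thread_fn, NULL);
    pthread_join(t, NULL);
    return 0;
}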
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
    struct task_struct *tsk;
    struct thread_info *ti;

    /* save the FPU state and set the TS flag */
    prepare_to_copy(orig);

    /* allocate a new process descriptor */
    tsk = alloc_task_struct();
    if (!tsk)
        return NULL;

    /* allocate a thread_info (the kernel stack area) */
    ti = alloc_thread_info(tsk);
    if (!ti) {
        /* allocation of the thread_info failed: free the task_struct as well */
        free_task_struct(tsk);
        return NULL;
    }

    /* copy the parent's task_struct */
    *tsk = *orig;
    /* make task->stack point to the new thread_info */
    tsk->stack = ti;
    /* copy the parent's thread_info and make thread_info->task point back to the task */
    setup_thread_stack(tsk, orig);

#ifdef CONFIG_CC_STACKPROTECTOR
    tsk->stack_canary = get_random_int();
#endif

    /* One for us, one for whoever does the "release_task()" (usually parent) */
    atomic_set(&tsk->usage,2);
    atomic_set(&tsk->fs_excl, 0);
#ifdef CONFIG_BLK_DEV_IO_TRACE
    tsk->btrace_seq = 0;
#endif
    tsk->splice_pipe = NULL;
    return tsk;
}
void prepare_to_copy(struct task_struct *tsk)
{
    unlazy_fpu(tsk);
}

unlazy_fpu() simply expands to __unlazy_fpu():

#define __unlazy_fpu( tsk ) do { \
    /* if the task has used the FPU/MMX/SSE registers */ \
    if (task_thread_info(tsk)->status & TS_USEDFPU) { \
        /* save those registers */ \
        __save_init_fpu(tsk); \
        /* set the TS bit in CR0 */ \
        stts(); \
    } else \
        tsk->fpu_counter = 0; \
} while (0)
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
    struct mm_struct * mm, *oldmm;
    int retval;

    /* initialise the memory-related counters in the task */
    tsk->min_flt = tsk->maj_flt = 0;
    tsk->nvcsw = tsk->nivcsw = 0;

    /* the task_struct was copied wholesale from the parent, so clear mm and
       active_mm before setting up the child's own */
    tsk->mm = NULL;
    tsk->active_mm = NULL;

    /*
     * Are we cloning a kernel thread?
     *
     * We need to steal a active VM for that..
     */
    oldmm = current->mm;
    if (!oldmm)
        return 0;

    /* with CLONE_VM the parent and child share the same address space:
       just take another reference on the parent's mm */
    if (clone_flags & CLONE_VM) {
        atomic_inc(&oldmm->mm_users);
        mm = oldmm;
        goto good_mm;
    }

    /* without CLONE_VM, duplicate the parent's address space: the mapped pages'
       reference counts are raised and the pages are marked read-only.  If either
       the parent or the child later writes to such a page, a page fault occurs and
       do_page_fault() gives the writer its own copy, clearing the read-only flag.
       See the article "linux内存管理之页面异常处理" on this site for the details. */
    retval = -ENOMEM;
    mm = dup_mm(tsk);
    if (!mm)
        goto fail_nomem;

good_mm:
    /* Initializing for Swap token stuff */
    mm->token_priority = 0;
    mm->last_interval = 0;

    /* set the task's mm and active_mm */
    tsk->mm = mm;
    tsk->active_mm = mm;
    return 0;

fail_nomem:
    return retval;
}
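The non-CLONE_VM path described in the comment is ordinary copy-on-write fork(). A quick user-space check (my own example): a write performed by the child after fork() lands in the child's private copy of the page and is invisible to the parent:

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static int value = 1;

int main(void)
{
    pid_t pid = fork();          /* no CLONE_VM: the child gets a COW copy of the mm */
    if (pid < 0) { perror("fork"); exit(1); }

    if (pid == 0) {
        value = 100;             /* the write faults and the child gets its own page */
        printf("child : value = %d\n", value);
        _exit(0);
    }

    waitpid(pid, NULL, 0);
    printf("parent: value = %d (unchanged, the write went to the child's copy)\n", value);
    return 0;
}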
First, a question: when duplicating the parent's mappings, do we need to copy all of them? The child has no use for a private copy of the parent's kernel-space mappings, so only the parent's user-space mappings need to be copied. With that in mind, let's look at the code. dup_mm() is implemented as follows:
static struct mm_struct *dup_mm(struct task_struct *tsk)
{
    struct mm_struct *mm, *oldmm = current->mm;
    int err;

    /* if the current process has no mm, bail out */
    if (!oldmm)
        return NULL;

    /* allocate storage for the new mm */
    mm = allocate_mm();
    if (!mm)
        goto fail_nomem;

    /* copy the current process's mm */
    memcpy(mm, oldmm, sizeof(*mm));

    /* Initializing for Swap token stuff */
    mm->token_priority = 0;
    mm->last_interval = 0;

    /* initialise the new mm */
    if (!mm_init(mm))
        goto fail_nomem;

    if (init_new_context(tsk, mm))
        goto fail_nocontext;

    /* the actual copying of the mappings */
    err = dup_mmap(mm, oldmm);
    if (err)
        goto free_pt;

    mm->hiwater_rss = get_mm_rss(mm);
    mm->hiwater_vm = mm->total_vm;

    return mm;

free_pt:
    mmput(mm);

fail_nomem:
    return NULL;

fail_nocontext:
    /*
     * If init_new_context() failed, we cannot use mmput() to free the mm
     * because it calls destroy_context()
     */
    mm_free_pgd(mm);
    free_mm(mm);
    return NULL;
}
Let's first look at how the mm is initialised. That happens in mm_init(), whose code is shown below:
static struct mm_struct * mm_init(struct mm_struct * mm)
{
    /* initialise the mm's fields */
    atomic_set(&mm->mm_users, 1);
    atomic_set(&mm->mm_count, 1);
    init_rwsem(&mm->mmap_sem);
    INIT_LIST_HEAD(&mm->mmlist);
    mm->flags = (current->mm) ? current->mm->flags
                  : MMF_DUMP_FILTER_DEFAULT;
    mm->core_waiters = 0;
    mm->nr_ptes = 0;
    set_mm_counter(mm, file_rss, 0);
    set_mm_counter(mm, anon_rss, 0);
    spin_lock_init(&mm->page_table_lock);
    rwlock_init(&mm->ioctx_list_lock);
    mm->ioctx_list = NULL;
    mm->free_area_cache = TASK_UNMAPPED_BASE;
    mm->cached_hole_size = ~0UL;

    /* allocate and initialise a PGD for the child */
    if (likely(!mm_alloc_pgd(mm))) {
        mm->def_flags = 0;
        return mm;
    }
    free_mm(mm);
    return NULL;
}

mm_alloc_pgd() is implemented as follows:

static inline int mm_alloc_pgd(struct mm_struct * mm)
{
    mm->pgd = pgd_alloc(mm);
    if (unlikely(!mm->pgd))
        return -ENOMEM;
    return 0;
}

pgd_t *pgd_alloc(struct mm_struct *mm)
{
    int i;
    pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);

    if (PTRS_PER_PMD == 1 || !pgd)
        return pgd;

    /* for entries 0 .. UNSHARED_PTRS_PER_PGD-1, set up the PGD->PMD links */
    for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
        pmd_t *pmd = pmd_cache_alloc(i);

        if (!pmd)
            goto out_oom;

        paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
        set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
    }
    return pgd;

out_oom:
    for (i--; i >= 0; i--) {
        pgd_t pgdent = pgd[i];
        void* pmd = (void *)__va(pgd_val(pgdent)-1);
        paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
        pmd_cache_free(pmd, i);
    }
    quicklist_free(0, pgd_dtor, pgd);
    return NULL;
}
static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
    struct vm_area_struct *mpnt, *tmp, **pprev;
    struct rb_node **rb_link, *rb_parent;
    int retval;
    unsigned long charge;
    struct mempolicy *pol;

    /* take the parent's mmap_sem to guard against concurrent changes */
    down_write(&oldmm->mmap_sem);
    /* a no-op on x86 */
    flush_cache_dup_mm(oldmm);
    /*
     * Not linked in yet - no deadlock potential:
     */
    /* lock the child's mmap_sem as well */
    down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

    mm->locked_vm = 0;
    mm->mmap = NULL;
    mm->mmap_cache = NULL;
    mm->free_area_cache = oldmm->mmap_base;
    mm->cached_hole_size = ~0UL;
    mm->map_count = 0;
    cpus_clear(mm->cpu_vm_mask);
    mm->mm_rb = RB_ROOT;
    rb_link = &mm->mm_rb.rb_node;
    rb_parent = NULL;
    pprev = &mm->mmap;

    /* walk the parent's VMAs and copy each one into the child */
    for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
        struct file *file;

        if (mpnt->vm_flags & VM_DONTCOPY) {
            long pages = vma_pages(mpnt);
            mm->total_vm -= pages;
            vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
                                -pages);
            continue;
        }
        charge = 0;
        if (mpnt->vm_flags & VM_ACCOUNT) {
            unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
            if (security_vm_enough_memory(len))
                goto fail_nomem;
            charge = len;
        }
        tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (!tmp)
            goto fail_nomem;
        *tmp = *mpnt;
        pol = mpol_copy(vma_policy(mpnt));
        retval = PTR_ERR(pol);
        if (IS_ERR(pol))
            goto fail_nomem_policy;
        vma_set_policy(tmp, pol);
        tmp->vm_flags &= ~VM_LOCKED;
        tmp->vm_mm = mm;
        tmp->vm_next = NULL;
        anon_vma_link(tmp);
        file = tmp->vm_file;
        /* this VMA maps a file */
        if (file) {
            struct inode *inode = file->f_path.dentry->d_inode;
            get_file(file);
            if (tmp->vm_flags & VM_DENYWRITE)
                atomic_dec(&inode->i_writecount);

            /* insert tmp into the share list, just after mpnt */
            spin_lock(&file->f_mapping->i_mmap_lock);
            tmp->vm_truncate_count = mpnt->vm_truncate_count;
            flush_dcache_mmap_lock(file->f_mapping);
            vma_prio_tree_add(tmp, mpnt);
            flush_dcache_mmap_unlock(file->f_mapping);
            spin_unlock(&file->f_mapping->i_mmap_lock);
        }

        /*
         * Link in the new vma and copy the page table entries.
         */
        *pprev = tmp;
        pprev = &tmp->vm_next;

        /* insert the new VMA into the child mm's red-black tree */
        __vma_link_rb(mm, tmp, rb_link, rb_parent);
        rb_link = &tmp->vm_rb.rb_right;
        rb_parent = &tmp->vm_rb;

        /* one more VMA in the child */
        mm->map_count++;
        /* copy the actual page-table mappings for this VMA */
        retval = copy_page_range(mm, oldmm, mpnt);

        /* the VMA has just been added to the new mm; call its open() method if it has one */
        if (tmp->vm_ops && tmp->vm_ops->open)
            tmp->vm_ops->open(tmp);

        if (retval)
            goto out;
    }
    /* a new mm has just been created */
    arch_dup_mmap(oldmm, mm);
    retval = 0;
out:
    up_write(&mm->mmap_sem);
    flush_tlb_mm(oldmm);
    up_write(&oldmm->mmap_sem);
    return retval;
fail_nomem_policy:
    kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
    retval = -ENOMEM;
    vm_unacct_memory(charge);
    goto out;
}
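Each iteration of the loop above duplicates one VMA, so right after fork() the child's user-space layout matches the parent's. A simple way to see this from user space is to count the lines of /proc/self/maps (one line per VMA) in the parent and in a freshly forked child; the count_vmas() helper below is just an illustration written for this article:

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

/* count the lines of /proc/self/maps, i.e. the number of VMAs in current->mm */
static int count_vmas(void)
{
    FILE *f = fopen("/proc/self/maps", "r");
    int c, lines = 0;

    if (!f) { perror("fopen"); exit(1); }
    while ((c = fgetc(f)) != EOF)
        if (c == '\n')
            lines++;
    fclose(f);
    return lines;
}

int main(void)
{
    printf("parent: %d VMAs before fork\n", count_vmas());

    pid_t pid = fork();
    if (pid < 0) { perror("fork"); exit(1); }
    if (pid == 0) {
        /* the child's mm was just built by dup_mmap(): same user-space layout */
        printf("child : %d VMAs after fork\n", count_vmas());
        _exit(0);
    }
    waitpid(pid, NULL, 0);
    return 0;
}

Back inside the loop, the page-table entries for each VMA are copied by copy_page_range():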
/* copy the page-table mappings covered by vma from src_mm into dst_mm */
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        struct vm_area_struct *vma)
{
    pgd_t *src_pgd, *dst_pgd;
    unsigned long next;
    unsigned long addr = vma->vm_start;
    unsigned long end = vma->vm_end;

    /*
     * Don't copy ptes where a page fault will fill them correctly.
     * Fork becomes much lighter when there are big shared or private
     * readonly mappings. The tradeoff is that copy_page_range is more
     * efficient than faulting.
     */
    if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
        if (!vma->anon_vma)
            return 0;
    }

    /* without CONFIG_HUGETLB_PAGE, is_vm_hugetlb_page() always returns 0 */
    if (is_vm_hugetlb_page(vma))
        return copy_hugetlb_page_range(dst_mm, src_mm, vma);

    dst_pgd = pgd_offset(dst_mm, addr);
    src_pgd = pgd_offset(src_mm, addr);
    do {
        next = pgd_addr_end(addr, end);
        /* nothing mapped at this PGD entry in the source? skip it */
        if (pgd_none_or_clear_bad(src_pgd))
            continue;
        /* descend into the PUD level, the extra level of the four-level scheme */
        if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
                        vma, addr, next))
            return -ENOMEM;
    } while (dst_pgd++, src_pgd++, addr = next, addr != end);
    return 0;
}
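For reference, this walk is driven by splitting the virtual address into one index per page-table level plus a page offset. The sketch below decomposes an address using the 4KB-page, 9-bits-per-level layout of x86-64; the shift constants are assumptions made for the illustration and differ on 32-bit x86, which has only two or three levels:

#include <stdio.h>
#include <stdint.h>

/* assumed x86-64 4KB-page layout: 9 bits per level, 12-bit page offset */
#define PAGE_SHIFT 12
#define PTE_SHIFT  12
#define PMD_SHIFT  21
#define PUD_SHIFT  30
#define PGD_SHIFT  39
#define PTRS_PER_LEVEL 512

int main(void)
{
    uint64_t addr = 0x00007f1234567abcULL;   /* an arbitrary user-space address */

    printf("addr      = %#llx\n", (unsigned long long)addr);
    printf("pgd index = %llu\n", (unsigned long long)((addr >> PGD_SHIFT) & (PTRS_PER_LEVEL - 1)));
    printf("pud index = %llu\n", (unsigned long long)((addr >> PUD_SHIFT) & (PTRS_PER_LEVEL - 1)));
    printf("pmd index = %llu\n", (unsigned long long)((addr >> PMD_SHIFT) & (PTRS_PER_LEVEL - 1)));
    printf("pte index = %llu\n", (unsigned long long)((addr >> PTE_SHIFT) & (PTRS_PER_LEVEL - 1)));
    printf("offset    = %llu\n", (unsigned long long)(addr & ((1ULL << PAGE_SHIFT) - 1)));
    return 0;
}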
The walk goes pgd → pud → pmd → pte, following the mapping all the way down to the PTEs. As noted earlier, the PGD-to-PMD links were already set up when the mm was initialised, so we can go straight to the PTE handling, reached via copy_page_range() → copy_pud_range() → copy_pmd_range() → copy_pte_range():
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
        unsigned long addr, unsigned long end)
{
    pte_t *src_pte, *dst_pte;
    spinlock_t *src_ptl, *dst_ptl;
    int progress = 0;
    int rss[2];

again:
    rss[1] = rss[0] = 0;
    /* allocate the destination PTE page and map it */
    dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
    if (!dst_pte)
        return -ENOMEM;
    src_pte = pte_offset_map_nested(src_pmd, addr);
    src_ptl = pte_lockptr(src_mm, src_pmd);
    spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
    arch_enter_lazy_mmu_mode();

    do {
        /*
         * We are holding two locks at this point - either of them
         * could generate latencies in another task on another CPU.
         */
        if (progress >= 32) {
            progress = 0;
            if (need_resched() ||
                need_lockbreak(src_ptl) ||
                need_lockbreak(dst_ptl))
                break;
        }
        /* the source PTE does not map anything: nothing to copy */
        if (pte_none(*src_pte)) {
            progress++;
            continue;
        }
        /* copy this entry into dst_pte */
        copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
        progress += 8;
    } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

    arch_leave_lazy_mmu_mode();
    spin_unlock(src_ptl);
    pte_unmap_nested(src_pte - 1);
    add_mm_rss(dst_mm, rss[0], rss[1]);
    pte_unmap_unlock(dst_pte - 1, dst_ptl);
    cond_resched();
    if (addr != end)
        goto again;
    return 0;
}
The code of copy_one_pte() is as follows:
static inline void
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
        unsigned long addr, int *rss)
{
    unsigned long vm_flags = vma->vm_flags;
    pte_t pte = *src_pte;
    struct page *page;

    /* pte contains position in swap or file, so copy. */
    /* the mapped page is not present in memory; it may have been swapped out */
    if (unlikely(!pte_present(pte))) {
        if (!pte_file(pte)) {
            swp_entry_t entry = pte_to_swp_entry(pte);

            swap_duplicate(entry);
            /* make sure dst_mm is on swapoff's mmlist. */
            if (unlikely(list_empty(&dst_mm->mmlist))) {
                spin_lock(&mmlist_lock);
                if (list_empty(&dst_mm->mmlist))
                    list_add(&dst_mm->mmlist,
                         &src_mm->mmlist);
                spin_unlock(&mmlist_lock);
            }
            if (is_write_migration_entry(entry) &&
                    is_cow_mapping(vm_flags)) {
                /*
                 * COW mappings require pages in both parent
                 * and child to be set to read.
                 */
                make_migration_entry_read(&entry);
                pte = swp_entry_to_pte(entry);
                set_pte_at(src_mm, addr, src_pte, pte);
            }
        }
        goto out_set_pte;
    }

    /*
     * If it's a COW mapping, write protect it both
     * in the parent and the child
     */
    /* only when is_cow_mapping() is true is the entry write-protected here */
    if (is_cow_mapping(vm_flags)) {
        ptep_set_wrprotect(src_mm, addr, src_pte);
        pte = pte_wrprotect(pte);
    }

    /*
     * If it's a shared mapping, mark it clean in
     * the child
     */
    if (vm_flags & VM_SHARED)
        pte = pte_mkclean(pte);
    pte = pte_mkold(pte);

    /* look up the page frame this PTE maps */
    page = vm_normal_page(vma, addr, pte);
    /* if a real page was found, take a reference and update the rmap and RSS counters */
    if (page) {
        get_page(page);
        page_dup_rmap(page, vma, addr);
        rss[!!PageAnon(page)]++;
    }

out_set_pte:
    /* finally, install the PTE in the child */
    set_pte_at(dst_mm, addr, dst_pte, pte);
}
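Because copy_one_pte() only write-protects the entries, the actual page copies are deferred until someone writes: the first write to each page after fork() takes a minor fault and gets a private copy. This can be observed with getrusage(); in the hypothetical example below the child touches NPAGES pages that were resident before the fork and sees roughly one minor fault per page:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <unistd.h>

#define NPAGES 256
#define PAGE   4096

static char buf[NPAGES * PAGE];

static long minor_faults(void)
{
    struct rusage ru;
    getrusage(RUSAGE_SELF, &ru);
    return ru.ru_minflt;
}

int main(void)
{
    memset(buf, 1, sizeof(buf));        /* make sure the pages are resident before fork */

    pid_t pid = fork();
    if (pid < 0) { perror("fork"); exit(1); }
    if (pid == 0) {
        long before = minor_faults();
        memset(buf, 2, sizeof(buf));    /* first write after fork: one COW fault per page */
        long after = minor_faults();
        printf("child: ~%ld minor faults for %d written pages\n", after - before, NPAGES);
        _exit(0);
    }
    waitpid(pid, NULL, 0);
    return 0;
}

The last helper called from copy_process() that we will look at is copy_thread(), which builds the child's kernel stack and register frame: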
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
    unsigned long unused,
    struct task_struct * p, struct pt_regs * regs)
{
    struct pt_regs * childregs;
    struct task_struct *tsk;
    int err;

    /* the register frame at the top of the child's kernel stack */
    childregs = task_pt_regs(p);
    /* copy the parent's regs (the register values saved when the parent
       trapped into the kernel) onto the child's kernel stack */
    *childregs = *regs;
    /* eax holds the return value: set it to 0 so that, back in user space,
       the child sees the system call return 0 */
    childregs->eax = 0;
    /* esp: the child's user-space stack pointer */
    childregs->esp = esp;

    /* the child's kernel stack pointer, starting just below its pt_regs */
    p->thread.esp = (unsigned long) childregs;
    /* the top of the child's kernel stack */
    p->thread.esp0 = (unsigned long) (childregs+1);

    /* the next instruction the child will execute: ret_from_fork, the return
       path from kernel space back to user space */
    p->thread.eip = (unsigned long) ret_from_fork;

    savesegment(gs,p->thread.gs);

    tsk = current;
    if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
        p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
                        IO_BITMAP_BYTES, GFP_KERNEL);
        if (!p->thread.io_bitmap_ptr) {
            p->thread.io_bitmap_max = 0;
            return -ENOMEM;
        }
        set_tsk_thread_flag(p, TIF_IO_BITMAP);
    }

    /*
     * Set a new TLS for the child thread?
     */
    if (clone_flags & CLONE_SETTLS) {
        struct desc_struct *desc;
        struct user_desc info;
        int idx;

        err = -EFAULT;
        if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
            goto out;
        err = -EINVAL;
        if (LDT_empty(&info))
            goto out;

        idx = info.entry_number;
        if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
            goto out;

        desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
        desc->a = LDT_entry_a(&info);
        desc->b = LDT_entry_b(&info);
    }

    err = 0;
 out:
    if (err && p->thread.io_bitmap_ptr) {
        kfree(p->thread.io_bitmap_ptr);
        p->thread.io_bitmap_max = 0;
    }
    return err;
}
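Setting childregs->eax = 0 is exactly why fork() returns 0 in the child, while the parent gets the child's pid (the nr returned by do_fork()). The classic user-space illustration:

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();
    if (pid < 0) { perror("fork"); exit(1); }

    if (pid == 0) {
        /* in the child, copy_thread() set childregs->eax = 0, so fork() returns 0 */
        printf("child : fork() returned %d, my pid is %d\n", (int)pid, (int)getpid());
        _exit(0);
    }

    /* in the parent, do_fork() returns the child's pid */
    printf("parent: fork() returned %d\n", (int)pid);
    waitpid(pid, NULL, 0);
    return 0;
}

copy_thread() locates the child's register frame with task_pt_regs(), which is defined as follows: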
#define task_pt_regs(task) \
({ \
    struct pt_regs *__regs__; \
    /* task_stack_page(task) is ((task)->stack), i.e. the thread_info */ \
    __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
    __regs__ - 1; \
})

#define KSTK_TOP(info) \
({ \
    unsigned long *__ptr = (unsigned long *)(info); \
    /* i.e. __ptr += THREAD_SIZE, since THREAD_SIZE_LONGS is
       defined as (THREAD_SIZE/sizeof(unsigned long)) */ \
    (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
})
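In other words, the pt_regs frame sits just below the top of the THREAD_SIZE area pointed to by task->stack, 8 bytes down from the very top. The arithmetic can be sketched in user space; both the stack base address and the pt_regs size used below are made-up values purely for illustration:

#include <stdio.h>
#include <stdint.h>

#define THREAD_SIZE   8192UL    /* assumed 8KB kernel stack */
#define PT_REGS_SIZE  64UL      /* assumed size of struct pt_regs on i386, illustrative only */

int main(void)
{
    /* pretend this is task->stack, the base of the thread_info/stack area */
    uintptr_t stack_base = 0xc03f8000UL;

    uintptr_t kstk_top = stack_base + THREAD_SIZE;      /* KSTK_TOP() */
    uintptr_t pt_regs  = kstk_top - 8 - PT_REGS_SIZE;   /* task_pt_regs(): 8 unused bytes, then the frame */

    printf("stack base = %#lx\n", (unsigned long)stack_base);
    printf("stack top  = %#lx\n", (unsigned long)kstk_top);
    printf("pt_regs    = %#lx\n", (unsigned long)pt_regs);
    return 0;
}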