
  1. <mm_types.h>
  2. <mm_types.h> struct mm_struct {
  3. ...
  4. unsigned long (*get_unmapped_area) (struct file *filp,unsigned long addr, unsigned long len,unsigned long pgoff, unsigned long flags);
  5. ...
  6. unsigned long mmap_base; /* mmap区域的基地址 */
  7. unsigned long task_size; /* 进程虚拟内存空间的长度 */
  8. ...
  9. unsigned long start_code, end_code, start_data, end_data;//包含可执行代码区域,和已初始化数据区域(在ELF二进制文件映射到地址空间中之后,这些区域的长度不再改变)
  10. unsigned long start_brk, brk, start_stack; //堆的区域,分配堆时,只改变brk值
  11. unsigned long arg_start, arg_end, env_start, env_end; //两个区域都位于栈中最高的区域
  12. ...
  13. }


  • IA-32:0x0804 8000
  • AMD64:0x0000 0000 0040 0000



  1. void arch_pick_mmap_layout(struct mm_struct *mm)
  2. {
  4. if (current_thread_info()->flags & _TIF_IA32)
  5. return ia32_pick_mmap_layout(mm);//该函数实际上是IA-32系统上arch_pick_mmap_layout的一个相同副本
  6. #endif
  7. mm->mmap_base = TASK_UNMAPPED_BASE;
  8. if (current->flags & PF_RANDOMIZE) {
  9. /* Add 28bit randomness which is about 40bits of address space
  10. because mmap base has to be page aligned.
  11. or ~1/128 of the total user VM
  12. (total user address space is 47bits) */
  13. /*
  14. 最初生成的随机偏移量是28位,因为mmap基地址必须对齐到页,因此将该值左移PAGE_SHIFT位(12),最后的偏移是40位。大约是用户虚拟内存总量的1/128
  15. */
  16. unsigned rnd = get_random_int() & 0xfffffff;
  17. mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
  18. }
  19. mm->get_unmapped_area = arch_get_unmapped_area;
  20. mm->unmap_area = arch_unmap_area;
  21. }




按需分配和填充页称之为按需调页法(demand paging):

  • 进程试图访问用户地址空间中的一个内存地址,但使用页表无法确定物理地址(物理内存中没有关联页)。
  • 处理器触发缺页异常,发送到内核
  • 内核会检查负责缺页区域的进程地址空间数据结构,找到适当的后备存储器,或者确认该访问实际上是不正确的。
  • 分配物理页,并从后备存储器读取所需数据填充
  • 借助于页表将物理内存并入到用户进程的地址空间,应用程序恢复执行。



struct mm_struct很重要,按前文的讨论,该结构提供了进程在内存中布局的所有必要信息。另外,它还包括下列成员,用于管理用户进程在虚拟地址空间中的所有内存区域。

  1. struct mm_struct {
  2. struct vm_area_struct * mmap; //虚拟内存区域列表
  3. struct rb_root mm_rb;
  4. struct vm_area_struct * mmap_cache; //上一次find_vma结构
  5. }



  • 在一个单链表上(开始于mm_struct->mmap)。
  • 在一个红黑树中,根节点位于mm_rb。


  1. struct vm_area_struct {
  2. struct mm_struct * vm_mm; /* The address space we belong to. *///所属地址空间
  3. unsigned long vm_start; /* Our start address within vm_mm. *///vm_mm内的起始地址
  4. unsigned long vm_end; /* The first byte after our end address///在vm_mm内结束地址之后的第一个字节的地址
  5. within vm_mm. */
  6. /* linked list of VM areas per task, sorted by address */
  7. struct vm_area_struct *vm_next;//各进程的虚拟内存区域链表,按地址排序
  8. pgprot_t vm_page_prot; /* Access permissions of this VMA. *///该虚拟内存区域的访问权限
  9. unsigned long vm_flags; /* Flags, listed below. *///标志
  10. struct rb_node vm_rb;
  11. /*
  12. * For areas with an address space and backing store,
  13. * linkage into the address_space->i_mmap prio tree, or
  14. * linkage to the list of like vmas hanging off its node, or
  15. * linkage of vma in the address_space->i_mmap_nonlinear list.
  16. */
  17. /*
  18. 对于有地址空间和后备存储器的区域来说
  19. shared连接到address_space->i_mmap优先树,
  20. 或连接到悬挂在优先树结点之外、类似的一组虚拟内存区域的链表,
  21. 或连接到address_space->i_mmap_nonlinear链表中的虚拟内存区域。
  22. */
  23. union {
  24. struct {
  25. struct list_head list;
  26. void *parent; /* aligns with prio_tree_node parent */
  27. struct vm_area_struct *head;
  28. } vm_set;
  29. struct raw_prio_tree_node prio_tree_node;
  30. } shared;
  31. /*
  32. * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
  33. * list, after a COW of one of the file pages. A MAP_SHARED vma
  34. * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
  35. * or brk vma (with NULL file) can only be in an anon_vma list.
  36. */
  37. /*
  38. *在文件的某一页经过写时复制之后,文件的MAP_PRIVATE虚拟内存区域可能同时在i_mmap树和
  39. *anon_vma链表中。MAP_SHARED虚拟内存区域只能在i_mmap树中。
  40. * 匿名的MAP_PRIVATE、栈或brk虚拟内存区域(file指针为NULL)只能处于anon_vma链表中。
  41. */
  42. //anon_vma_node和anon_vma用于管理源自匿名映射(anonymous mapping)的共享页
  43. struct list_head anon_vma_node; /* Serialized by anon_vma->lock */
  44. struct anon_vma *anon_vma; /* Serialized by page_table_lock */
  45. /* Function pointers to deal with this struct. */
  46. struct vm_operations_struct * vm_ops;
  47. /* Information about our backing store: */
  48. unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE units, *not* PAGE_CACHE_SIZE *///指定文件映射的偏移量(单位页)
  49. struct file * vm_file; /* File we map to (can be NULL). */
  50. void * vm_private_data; /* was vm_pte (shared mem) */
  51. unsigned long vm_truncate_count;/* truncate_count or restart_addr */
  52. #ifndef CONFIG_MMU
  53. atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */
  54. #endif
  55. #ifdef CONFIG_NUMA
  56. struct mempolicy *vm_policy; /* NUMA policy for the VMA */
  57. #endif
  58. };




每个打开文件都表示为struct file的一个实例。该结构包含了一个指向地址空间对象struct address_space的指针。该对象是优先查找树的基础,而文件区间与其映射到的地址空间之间的关联即通过优先树建立。

  1. <fs.h>
  2. struct address_space {
  3. struct inode *host; /* owner: inode, block_device */
  4. ...
  5. struct prio_tree_root i_mmap; /* 私有和共享映射的树 */
  6. struct list_head i_mmap_nonlinear;/*VM_NONLINEAR映射的链表 */
  7. ...
  8. }
  9. <fs.h>
  10. struct file {
  11. ...
  12. struct address_space *f_mapping;
  13. ...
  14. }
  15. <fs.h>
  16. struct inode {
  17. ...
  18. struct address_space *i_mapping;
  19. ...
  20. }

优先树是地址空间的基本要素,而优先树包含了所有相关的vm_area_struct实例,描述了与inode关联的文件区间到一些虚拟地址空间的映射。

记住:一个给定的struct vm_area_struct实例,可以包含在两个数据结构中。一个建立进程虚拟地址空间中的区域与潜在的文件数据之间的关联,一个用于查找映射了给定文件区间的所有地址空间。




  • 如果一个新区域紧接着现存区域前后(或两个现存区域之间),内核将涉及的数据结构合并为一个。当然,前提是涉及的所有区域的访问权限是相同的,而且是从同一个后备存储器映射的连续数据
  • 如果在区域的开始或结束处进行删除,则必须截断现存的数据结构
  • 如果删除两个区域之间的一个区域,那么一方面需要减小现存数据结构的长度,另一方面需要为形成的新区域创建一个新的数据结构。


通过虚拟地址addr,find_vma可以找到满足 addr < vm_area_struct->vm_end 条件的第一个区域(该区域不一定包含addr;如果不存在这样的区域,则返回NULL)。

  1. struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
  2. {
  3. struct vm_area_struct *vma = NULL;
  4. if (mm) {
  5. /* Check the cache first. */
  6. /* (Cache hit rate is typically around 35%.) */
  7. vma = mm->mmap_cache;
  8. if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
  9. struct rb_node * rb_node;
  10. rb_node = mm->mm_rb.rb_node;
  11. vma = NULL;
  12. while (rb_node) {
  13. struct vm_area_struct * vma_tmp;
  14. vma_tmp = rb_entry(rb_node,
  15. struct vm_area_struct, vm_rb);
  16. if (vma_tmp->vm_end > addr) {
  17. vma = vma_tmp;
  18. if (vma_tmp->vm_start <= addr)
  19. break;
  20. rb_node = rb_node->rb_left;
  21. } else
  22. rb_node = rb_node->rb_right;
  23. }
  24. if (vma)
  25. mm->mmap_cache = vma;
  26. }
  27. }
  28. return vma;
  29. }


  1. static inline struct vm_area_struct * find_vma_intersection(struct mm_struct *mm,unsigned long start_addr,unsigned long end_addr){
  2. struct vm_area_struct *vma = find_vma(mm,start_addr);
  3. if(vma && end_addr<=vma->vm_start)
  4. vma=NULL;
  5. return vma;
  6. }




  1. struct vm_area_struct *vma_merge(struct mm_struct *mm,
  2. struct vm_area_struct *prev, unsigned long addr,
  3. unsigned long end, unsigned long vm_flags,
  4. struct anon_vma *anon_vma, struct file *file,
  5. pgoff_t pgoff, struct mempolicy *policy)
  6. {
  7. pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
  8. struct vm_area_struct *area, *next;
  9. /*
  10. * We later require that vma->vm_flags == vm_flags,
  11. * so this tests vma->vm_flags & VM_SPECIAL, too.
  12. */
  13. if (vm_flags & VM_SPECIAL)
  14. return NULL;
  15. if (prev)
  16. next = prev->vm_next;
  17. else
  18. next = mm->mmap;
  19. area = next;
  20. if (next && next->vm_end == end) /* cases 6, 7, 8 */
  21. next = next->vm_next;
  22. /*
  23. * Can it merge with the predecessor?
  24. */
  25. if (prev && prev->vm_end == addr &&
  26. mpol_equal(vma_policy(prev), policy) &&
  27. can_vma_merge_after(prev, vm_flags,
  28. anon_vma, file, pgoff)) {//如果两个文件映射在地址空间中连续,但在文件中不连续,亦无法合并
  29. /*
  30. * OK, it can. Can we now merge in the successor as well?
  31. */
  32. if (next && end == next->vm_start &&
  33. mpol_equal(policy, vma_policy(next)) &&
  34. can_vma_merge_before(next, vm_flags,
  35. anon_vma, file, pgoff+pglen) &&
  36. is_mergeable_anon_vma(prev->anon_vma,
  37. next->anon_vma)) {//如果前一个和后一个区域都可以与当前区域合并,还必须确认前一个和后一个区域的匿名映射可以合并
  38. /* cases 1, 6 */
  39. vma_adjust(prev, prev->vm_start,
  40. next->vm_end, prev->vm_pgoff, NULL);
  41. } else /* cases 2, 5, 7 */
  42. vma_adjust(prev, prev->vm_start,
  43. end, prev->vm_pgoff, NULL);
  44. return prev;
  45. }
  46. /*
  47. * Can this new request be merged in front of next?
  48. */
  49. if (next && end == next->vm_start &&
  50. mpol_equal(policy, vma_policy(next)) &&
  51. can_vma_merge_before(next, vm_flags,
  52. anon_vma, file, pgoff+pglen)) {
  53. if (prev && addr < prev->vm_end) /* case 4 */
  54. vma_adjust(prev, prev->vm_start,
  55. addr, prev->vm_pgoff, NULL);
  56. else /* cases 3, 8 */
  57. vma_adjust(area, addr, next->vm_end,
  58. next->vm_pgoff - pglen, NULL);
  59. return area;
  60. }
  61. return NULL;
  62. }



  1. /* Insert vm structure into process list sorted by address
  2. * and into the inode's i_mmap tree. If vm_file is non-NULL
  3. * then i_mmap_lock is taken here.
  4. */
  5. int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
  6. {
  7. struct vm_area_struct * __vma, * prev;
  8. struct rb_node ** rb_link, * rb_parent;
  9. /*
  10. * The vm_pgoff of a purely anonymous vma should be irrelevant
  11. * until its first write fault, when page's anon_vma and index
  12. * are set. But now set the vm_pgoff it will almost certainly
  13. * end up with (unless mremap moves it elsewhere before that
  14. * first wfault), so /proc/pid/maps tells a consistent story.
  15. *
  16. * By setting it to reflect the virtual start address of the
  17. * vma, merges and splits can happen in a seamless way, just
  18. * using the existing file pgoff checks and manipulations.
  19. * Similarly in do_mmap_pgoff and in do_brk.
  20. */
  21. if (!vma->vm_file) {
  22. BUG_ON(vma->anon_vma);
  23. vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
  24. }
  25. __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);//获取prev(前一个区域),rb_parent(保存新区域的父节点),rb_link(包含该区域自身的叶节点),以及
  26. if (__vma && __vma->vm_start < vma->vm_end)
  27. return -ENOMEM;
  28. if ((vma->vm_flags & VM_ACCOUNT) &&
  29. security_vm_enough_memory_mm(mm, vma_pages(vma)))
  30. return -ENOMEM;
  31. vma_link(mm, vma, prev, rb_link, rb_parent);
  32. return 0;
  33. }


  • __vma_link_list将新区域放置到进程管理区域的线性链表上。完成该工作,只需提供使用find_vma_prepare找到的前一个和后一个区域
  • __vma_link_rb将新区域连接到红黑树的数据结构中
  • __anon_vma_link将vm_area_struct实例添加到匿名映射的链表
  • __vma_link_file将相关的address_space和映射(如果是文件映射)关联起来,并使用vma_prio_tree_insert将该区域添加到优先树中



  1. unsigned long
  2. arch_get_unmapped_area(struct file *filp, unsigned long addr,
  3. unsigned long len, unsigned long pgoff, unsigned long flags)
  4. {
  5. struct mm_struct *mm = current->mm;
  6. struct vm_area_struct *vma;
  7. unsigned long start_addr;
  8. if (len > TASK_SIZE)
  9. return -ENOMEM;
  10. if (flags & MAP_FIXED)
  11. return addr;
  12. if (addr) {
  13. addr = PAGE_ALIGN(addr);
  14. vma = find_vma(mm, addr);
  15. if (TASK_SIZE - len >= addr &&
  16. (!vma || addr + len <= vma->vm_start))
  17. return addr;
  18. }
  19. if (len > mm->cached_hole_size) {
  20. start_addr = addr = mm->free_area_cache;
  21. } else {
  22. start_addr = addr = TASK_UNMAPPED_BASE;
  23. mm->cached_hole_size = 0;
  24. }
  25. full_search:
  26. for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
  27. /* At this point: (!vma || addr < vma->vm_end). */
  28. if (TASK_SIZE - len < addr) {
  29. /*
  30. * Start a new search - just in case we missed
  31. * some holes.
  32. */
  33. if (start_addr != TASK_UNMAPPED_BASE) {
  34. addr = TASK_UNMAPPED_BASE;
  35. start_addr = addr;
  36. mm->cached_hole_size = 0;
  37. goto full_search;
  38. }
  39. return -ENOMEM;
  40. }
  41. if (!vma || addr + len <= vma->vm_start) {
  42. /*
  43. * Remember the place where we stopped the search:
  44. */
  45. mm->free_area_cache = addr + len;
  46. return addr;
  47. }
  48. if (addr + mm->cached_hole_size < vma->vm_start)
  49. mm->cached_hole_size = vma->vm_start - addr;
  50. addr = vma->vm_end;
  51. }
  52. }





  1. mm/filemap.c
  2. struct vm_operations_struct generic_file_vm_ops = {
  3. .fault = filemap_fault,
  4. };


  1. void *mmap(void *addr, size_t length, int prot, int flags,int fd, off_t offset);



  • 分配一个新的vm_area_struct实例,并插入到进程的链表/树数据结构中
  • 用特定于文件的函数file->f_op->mmap创建映射。大多数文件系统将generic_file_mmap用于该目的。它所做的所有工作,就是将映射的vm_ops成员设置为generic_file_vm_ops。
  1. vma->vm_ops = &generic_file_vm_ops;


  • 如果设置了VM_LOCKED(无论是通过系统调用的标志参数显式传递进来,还是通过mlockall机制隐式设置),内核都会调用make_pages_present依次扫描映射中各页,对每一页触发缺页异常以便读入其数据






  • 所有建立的非线性映射的vm_area_struct实例维护在一个链表中,表头是struct address_space的i_mmap_nonlinear成员。链表中的各个vm_area_struct实例可以采用shared.vm_set.list作为链表元素,因为在标准的优先树中不存在非线性映射区域
  • 所述区域对应的页表项用一些特殊的项填充。这些页表项看起来像是对应于不存在的页,但其中包含附加信息,将其标识为非线性映射的页表项。在访问此类页表项描述的页时,会产生一个缺页异常,并读入正确的页。(pgoff_to_pte将以页号表示的文件偏移量编码为一种可以存储在页表中的格式。pte_to_pgoff可以解码页表中存储的编码过的文件偏移量。pte_file(pte)检查给定的页表项是否用于表示非线性映射。)








基于文件映射的页:void page_add_file_rmap(struct page *page)







  1. <arch\x86\mm\fault_32.c>
  2. fastcall void __kprobes do_page_fault(struct pt_regs *regs,
  3. unsigned long error_code)
  4. {
  5. struct task_struct *tsk;
  6. struct mm_struct *mm;
  7. struct vm_area_struct * vma;
  8. unsigned long address;
  9. int write, si_code;
  10. int fault;
  11. /*
  12. * We can fault from pretty much anywhere, with unknown IRQ state.
  13. */
  14. trace_hardirqs_fixup();
  15. /* get the address */
  16. address = read_cr2();//保存触发异常的地址
  17. tsk = current;
  18. si_code = SEGV_MAPERR;
  19. /*
  20. * We fault-in kernel-space virtual memory on-demand. The
  21. * 'reference' page table is init_mm.pgd.
  22. *
  23. * NOTE! We MUST NOT take any locks for this case. We may
  24. * be in an interrupt or a critical region, and should
  25. * only copy the information from the master page table,
  26. * nothing more.
  27. *
  28. * This verifies that the fault happens in kernel space
  29. * (error_code & 4) == 0, and that the fault was not a
  30. * protection error (error_code & 9) == 0.
  31. */
  32. /*我们因异常而进入到内核虚拟地址空间。
  33. *参考页表为init_mm.pgd。
  34. *
  35. *要注意!对这种情况我们不能获取任何锁。
  36. *我们可能是在中断或临界区中,
  37. *只应当从主页表复制信息,不允许其他操作
  38. *
  39. *下述代码验证了异常发生于内核空间(error_code&4) == 0,
  40. *而且异常不是保护错误(error_code & 9) == 0
  41. */
  42. if (unlikely(address >= TASK_SIZE)) {
  43. if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)//发生在核心态,且异常不是由保护错误触发时,内核使用vmalloc_fault同步页表(从init的页表复制相关的项到当前页表)
  44. return;
  45. if (notify_page_fault(regs))
  46. return;
  47. /*
  48. * Don't take the mm semaphore here. If we fixup a prefetch
  49. * fault we could otherwise deadlock.
  50. */
  51. /*不要在这里获取mm信号量。
  52. *如果修复了取指令造成的缺页异常,则会进入死锁。
  53. *
  54. */
  55. //如果异常是在中断期间或内核线程过程中触发,也没有自身的上下文因而也没有独立的mm_struct实例,则跳转
  56. goto bad_area_nosemaphore;
  57. }
  58. if (notify_page_fault(regs))
  59. return;
  60. /* It's safe to allow irq's after cr2 has been saved and the vmalloc
  61. fault has been handled. */
  62. if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
  63. local_irq_enable();
  64. mm = tsk->mm;
  65. /*
  66. * If we're in an interrupt, have no user context or are running in an
  67. * atomic region then we must not take the fault..
  68. */
  69. /*如果我们在中断期间,也没有用户上下文,或者代码处于原子操作范围内,则不能处理异常
  70. */
  71. if (in_atomic() || !mm)
  72. goto bad_area_nosemaphore;
  73. /* When running in the kernel we expect faults to occur only to
  74. * addresses in user space. All other faults represent errors in the
  75. * kernel and should generate an OOPS. Unfortunately, in the case of an
  76. * erroneous fault occurring in a code path which already holds mmap_sem
  77. * we will deadlock attempting to validate the fault against the
  78. * address space. Luckily the kernel only validly references user
  79. * space from well defined areas of code, which are listed in the
  80. * exceptions table.
  81. *
  82. * As the vast majority of faults will be valid we will only perform
  83. * the source reference check when there is a possibility of a deadlock.
  84. * Attempt to lock the address space, if we cannot we then validate the
  85. * source. If this is invalid we can skip the address space check,
  86. * thus avoiding the deadlock.
  87. */
  88. if (!down_read_trylock(&mm->mmap_sem)) {
  89. if ((error_code & 4) == 0 &&
  90. !search_exception_tables(regs->eip))
  91. goto bad_area_nosemaphore;
  92. down_read(&mm->mmap_sem);
  93. }
  94. //如果异常并非出现在中断期间,也有相关的上下文,则内核检查进程的地址空间是否包含异常地址所在区域
  95. vma = find_vma(mm, address);
  96. if (!vma)
  97. goto bad_area;
  98. if (vma->vm_start <= address)
  99. goto good_area;
  100. if (!(vma->vm_flags & VM_GROWSDOWN))
  101. goto bad_area;
  102. if (error_code & 4) {
  103. /*
  104. * Accessing the stack below %esp is always a bug.
  105. * The large cushion allows instructions like enter
  106. * and pusha to work. ("enter $65535,$31" pushes
  107. * 32 pointers and then decrements %esp by 65535.)
  108. */
  109. if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
  110. goto bad_area;
  111. }
  112. if (expand_stack(vma, address))//增大栈
  113. goto bad_area;
  114. /*
  115. * Ok, we have a good vm_area for this memory access, so
  116. * we can handle it..
  117. */
  118. good_area:
  119. si_code = SEGV_ACCERR;
  120. write = 0;
  121. switch (error_code & 3) {
  122. default: /* 3: write, present *///写,不缺页
  123. /* fall through */
  124. case 2: /* write, not present *///写,缺页
  125. if (!(vma->vm_flags & VM_WRITE))
  126. goto bad_area;
  127. write++;
  128. break;
  129. case 1: /* read, present *///读,不缺页
  130. goto bad_area;
  131. case 0: /* read, not present *///读,缺页
  132. if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
  133. goto bad_area;
  134. }
  135. survive:
  136. /*
  137. * If for any reason at all we couldn't handle the fault,
  138. * make sure we exit gracefully rather than endlessly redo
  139. * the fault.
  140. */
  141. /*
  142. 如果由于某些原因我们无法处理异常,则必须优雅地退出,而不是一直重试。
  143. */
  144. fault = handle_mm_fault(mm, vma, address, write);//修正缺页异常(按需调入、换入等),返回值VM_FAULT_MINOR:数据已经在内存中,VM_FAULT_MAJOR:数据需要从块设备读取
  145. if (unlikely(fault & VM_FAULT_ERROR)) {
  146. if (fault & VM_FAULT_OOM)//内存不足
  147. goto out_of_memory;
  148. else if (fault & VM_FAULT_SIGBUS)//其他原因,发送信号给进程
  149. goto do_sigbus;
  150. BUG();
  151. }
  152. if (fault & VM_FAULT_MAJOR)
  153. tsk->maj_flt++;
  154. else
  155. tsk->min_flt++;
  156. /*
  157. * Did it hit the DOS screen memory VA from vm86 mode?
  158. */
  159. if (regs->eflags & VM_MASK) {
  160. unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
  161. if (bit < 32)
  162. tsk->thread.screen_bitmap |= 1 << bit;
  163. }
  164. up_read(&mm->mmap_sem);
  165. return;
  166. /*
  167. * Something tried to access memory that isn't in our memory map..
  168. * Fix it, but check if it's kernel or user first..
  169. */
  170. bad_area:
  171. up_read(&mm->mmap_sem);
  172. bad_area_nosemaphore:
  173. /* User mode accesses just cause a SIGSEGV */
  174. if (error_code & 4) {//用户态的访问导致了SIGSEGV(返回段错误)
  175. /*
  176. * It's possible to have interrupts off here.
  177. */
  178. local_irq_enable();
  179. /*
  180. * Valid to do another page fault here because this one came
  181. * from user space.
  182. */
  183. if (is_prefetch(regs, address, error_code))
  184. return;
  185. if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
  186. printk_ratelimit()) {
  187. printk("%s%s[%d]: segfault at %08lx eip %08lx "
  188. "esp %08lx error %lx\n",
  189. task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
  190. tsk->comm, task_pid_nr(tsk), address, regs->eip,
  191. regs->esp, error_code);
  192. }
  193. tsk->thread.cr2 = address;
  194. /* Kernel addresses are always protection faults */
  195. tsk->thread.error_code = error_code | (address >= TASK_SIZE);
  196. tsk->thread.trap_no = 14;
  197. force_sig_info_fault(SIGSEGV, si_code, address, tsk);
  198. return;
  199. }
  200. #ifdef CONFIG_X86_F00F_BUG
  201. /*
  202. * Pentium F0 0F C7 C8 bug workaround.
  203. */
  204. if (boot_cpu_data.f00f_bug) {
  205. unsigned long nr;
  206. nr = (address - idt_descr.address) >> 3;
  207. if (nr == 6) {
  208. do_invalid_op(regs, 0);
  209. return;
  210. }
  211. }
  212. #endif
  213. no_context:
  214. /* Are we prepared to handle this kernel fault? */
  215. /* 准备好处理这个内核异常了吗?(内核空间)(做最后的校正尝试)*/
  216. if (fixup_exception(regs))
  217. return;
  218. /*
  219. * Valid to do another page fault here, because if this fault
  220. * had been triggered by is_prefetch fixup_exception would have
  221. * handled it.
  222. */
  223. if (is_prefetch(regs, address, error_code))
  224. return;
  225. /*
  226. * Oops. The kernel tried to access some bad page. We'll have to
  227. * terminate things with extreme prejudice.
  228. */
  229. bust_spinlocks(1);
  230. //oops
  231. if (oops_may_print()) {
  232. __typeof__(pte_val(__pte(0))) page;
  233. #ifdef CONFIG_X86_PAE
  234. if (error_code & 16) {
  235. pte_t *pte = lookup_address(address);
  236. if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
  237. printk(KERN_CRIT "kernel tried to execute "
  238. "NX-protected page - exploit attempt? "
  239. "(uid: %d)\n", current->uid);
  240. }
  241. #endif
  242. if (address < PAGE_SIZE)
  243. printk(KERN_ALERT "BUG: unable to handle kernel NULL "
  244. "pointer dereference");
  245. else
  246. printk(KERN_ALERT "BUG: unable to handle kernel paging"
  247. " request");
  248. printk(" at virtual address %08lx\n",address);
  249. printk(KERN_ALERT "printing eip: %08lx ", regs->eip);
  250. page = read_cr3();
  251. page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
  252. #ifdef CONFIG_X86_PAE
  253. printk("*pdpt = %016Lx ", page);
  254. if ((page >> PAGE_SHIFT) < max_low_pfn
  255. && page & _PAGE_PRESENT) {
  256. page &= PAGE_MASK;
  257. page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
  258. & (PTRS_PER_PMD - 1)];
  259. printk(KERN_CONT "*pde = %016Lx ", page);
  260. page &= ~_PAGE_NX;
  261. }
  262. #else
  263. printk("*pde = %08lx ", page);
  264. #endif
  265. /*
  266. * We must not directly access the pte in the highpte
  267. * case if the page table is located in highmem.
  268. * And let's rather not kmap-atomic the pte, just in case
  269. * it's allocated already.
  270. */
  271. if ((page >> PAGE_SHIFT) < max_low_pfn
  272. && (page & _PAGE_PRESENT)
  273. && !(page & _PAGE_PSE)) {
  274. page &= PAGE_MASK;
  275. page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
  276. & (PTRS_PER_PTE - 1)];
  277. printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
  278. }
  279. printk("\n");
  280. }
  281. tsk->thread.cr2 = address;
  282. tsk->thread.trap_no = 14;
  283. tsk->thread.error_code = error_code;
  284. die("Oops", regs, error_code);
  285. bust_spinlocks(0);
  286. do_exit(SIGKILL);
  287. /*
  288. * We ran out of memory, or some other thing happened to us that made
  289. * us unable to handle the page fault gracefully.
  290. */
  291. out_of_memory:
  292. up_read(&mm->mmap_sem);
  293. if (is_global_init(tsk)) {
  294. yield();
  295. down_read(&mm->mmap_sem);
  296. goto survive;
  297. }
  298. printk("VM: killing process %s\n", tsk->comm);
  299. if (error_code & 4)
  300. do_group_exit(SIGKILL);
  301. goto no_context;
  302. do_sigbus:
  303. up_read(&mm->mmap_sem);
  304. /* Kernel mode? Handle exceptions or die */
  305. if (!(error_code & 4))
  306. goto no_context;
  307. /* User space => ok to do another page fault */
  308. if (is_prefetch(regs, address, error_code))
  309. return;
  310. tsk->thread.cr2 = address;
  311. tsk->thread.error_code = error_code;
  312. tsk->thread.trap_no = 14;
  313. force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
  314. }



  1. <mm/memory.c>
  2. static inline int handle_pte_fault(struct mm_struct *mm,
  3. struct vm_area_struct *vma, unsigned long address,
  4. pte_t *pte, pmd_t *pmd, int write_access)
  5. {
  6. pte_t entry;
  7. spinlock_t *ptl;
  8. entry = *pte;
  9. if (!pte_present(entry)) {//如果页不在物理内存中
  10. if (pte_none(entry)) {//没有对应的页表项
  11. if (vma->vm_ops) {//基于文件的映射,按需调页
  12. if (vma->vm_ops->fault || vma->vm_ops->nopage)
  13. return do_linear_fault(mm, vma, address,
  14. pte, pmd, write_access, entry);
  15. if (unlikely(vma->vm_ops->nopfn))
  16. return do_no_pfn(mm, vma, address, pte,
  17. pmd, write_access);
  18. }
  19. return do_anonymous_page(mm, vma, address,
  20. pte, pmd, write_access);//匿名页:按需分配
  21. }
  22. if (pte_file(entry))
  23. return do_nonlinear_fault(mm, vma, address,
  24. pte, pmd, write_access, entry);//换入非线性映射
  25. return do_swap_page(mm, vma, address,
  26. pte, pmd, write_access, entry);//换入
  27. }
  28. ptl = pte_lockptr(mm, pmd);
  29. spin_lock(ptl);
  30. if (unlikely(!pte_same(*pte, entry)))
  31. goto unlock;
  32. if (write_access) {//如果该区域对页授予了写权限,而硬件的存储机制没有授予,则COW
  33. if (!pte_write(entry))
  34. return do_wp_page(mm, vma, address,
  35. pte, pmd, ptl, entry);
  36. entry = pte_mkdirty(entry);
  37. }
  38. entry = pte_mkyoung(entry);
  39. if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
  40. update_mmu_cache(vma, address, entry);
  41. } else {
  42. /*
  43. * This is needed only for protection faults but the arch code
  44. * is not yet telling us if this is a protection fault or not.
  45. * This still avoids useless tlb flushes for .text page faults
  46. * with threads.
  47. */
  48. if (write_access)
  49. flush_tlb_page(vma, address);
  50. }
  51. unlock:
  52. pte_unmap_unlock(pte, ptl);
  53. return 0;
  54. }


  1. static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  2. unsigned long address, pmd_t *pmd,
  3. pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
  4. {
  5. pte_t *page_table;
  6. spinlock_t *ptl;
  7. struct page *page;
  8. pte_t entry;
  9. int anon = 0;
  10. struct page *dirty_page = NULL;
  11. struct vm_fault vmf;
  12. int ret;
  13. int page_mkwrite = 0;
  14. vmf.virtual_address = (void __user *)(address & PAGE_MASK);
  15. vmf.pgoff = pgoff;
  16. vmf.flags = flags;
  17. vmf.page = NULL;
  18. BUG_ON(vma->vm_flags & VM_PFNMAP);
  19. //将所需数据读入到发生异常的页(内核使用address_space对象中的信息,从后备存储器将数据读取到物理内存页)
  20. if (likely(vma->vm_ops->fault)) {
  21. ret = vma->vm_ops->fault(vma, &vmf);
  22. if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
  23. return ret;
  24. } else {
  25. /* Legacy ->nopage path */
  26. ret = 0;
  27. vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
  28. /* no page was available -- either SIGBUS or OOM */
  29. if (unlikely(vmf.page == NOPAGE_SIGBUS))
  30. return VM_FAULT_SIGBUS;
  31. else if (unlikely(vmf.page == NOPAGE_OOM))
  32. return VM_FAULT_OOM;
  33. }
  34. /*
  35. * For consistency in subsequent calls, make the faulted page always
  36. * locked.
  37. */
  38. if (unlikely(!(ret & VM_FAULT_LOCKED)))
  39. lock_page(vmf.page);
  40. else
  41. VM_BUG_ON(!PageLocked(vmf.page));
  42. /*
  43. * Should we do an early C-O-W break?
  44. */
  45. page = vmf.page;
  46. if (flags & FAULT_FLAG_WRITE) {
  47. if (!(vma->vm_flags & VM_SHARED)) {//私有映射
  48. anon = 1;
  49. if (unlikely(anon_vma_prepare(vma))) {
  50. ret = VM_FAULT_OOM;
  51. goto out;
  52. }
  53. page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
  54. vma, address);
  55. if (!page) {
  56. ret = VM_FAULT_OOM;
  57. goto out;
  58. }
  59. copy_user_highpage(page, vmf.page, address, vma);
  60. } else {//共享映射
  61. /*
  62. * If the page will be shareable, see if the backing
  63. * address space wants to know that the page is about
  64. * to become writable
  65. */
  66. if (vma->vm_ops->page_mkwrite) {
  67. unlock_page(page);
  68. if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
  69. ret = VM_FAULT_SIGBUS;
  70. anon = 1; /* no anon but release vmf.page */
  71. goto out_unlocked;
  72. }
  73. lock_page(page);
  74. /*
  75. * XXX: this is not quite right (racy vs
  76. * invalidate) to unlock and relock the page
  77. * like this, however a better fix requires
  78. * reworking page_mkwrite locking API, which
  79. * is better done later.
  80. */
  81. if (!page->mapping) {
  82. ret = 0;
  83. anon = 1; /* no anon but release vmf.page */
  84. goto out;
  85. }
  86. page_mkwrite = 1;
  87. }
  88. }
  89. }
  90. page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
  91. /*
  92. * This silly early PAGE_DIRTY setting removes a race
  93. * due to the bad i386 page protection. But it's valid
  94. * for other architectures too.
  95. *
  96. * Note that if write_access is true, we either now have
  97. * an exclusive copy of the page, or this is a shared mapping,
  98. * so we can make it writable and dirty to avoid having to
  99. * handle that later.
  100. */
  101. /* Only go through if we didn't race with anybody else... */
  102. if (likely(pte_same(*page_table, orig_pte))) {
  103. flush_icache_page(vma, page);
  104. entry = mk_pte(page, vma->vm_page_prot);
  105. if (flags & FAULT_FLAG_WRITE)
  106. entry = maybe_mkwrite(pte_mkdirty(entry), vma);
  107. set_pte_at(mm, address, page_table, entry);
  108. if (anon) {
  109. inc_mm_counter(mm, anon_rss);
  110. lru_cache_add_active(page);
  111. page_add_new_anon_rmap(page, vma, address);//建立匿名页反向映射
  112. } else {
  113. inc_mm_counter(mm, file_rss);
  114. page_add_file_rmap(page);//建立映射页反向映射
  115. if (flags & FAULT_FLAG_WRITE) {
  116. dirty_page = page;
  117. get_page(dirty_page);
  118. }
  119. }
  120. /* no need to invalidate: a not-present page won't be cached */
  121. update_mmu_cache(vma, address, entry);
  122. } else {
  123. if (anon)
  124. page_cache_release(page);
  125. else
  126. anon = 1; /* no anon but release faulted_page */
  127. }
  128. pte_unmap_unlock(page_table, ptl);
  129. out:
  130. unlock_page(vmf.page);
  131. out_unlocked:
  132. if (anon)
  133. page_cache_release(vmf.page);
  134. else if (dirty_page) {
  135. if (vma->vm_file)
  136. file_update_time(vma->vm_file);
  137. set_page_dirty_balance(dirty_page, page_mkwrite);
  138. put_page(dirty_page);
  139. }
  140. return ret;
  141. }


  • 使用vm_area_struct->vm_file找到映射的file对象
  • 在file->f_mapping中找到指向映射自身的指针
  • 每个地址空间都有特定的地址空间操作,从中选择readpage方法




首先调用vm_normal_page,找到struct page实例。







在处理不是由于访问vmalloc区域导致的缺页异常时,异常修正(exception fixup)机制是一个最后手段。

每次发生缺页异常时,将输出异常的原因和当前执行代码的地址。这使得内核可以编译一个列表,列出所有可能执行未授权内存访问操作的危险代码块。这个“异常表”在链接内核映像时创建,在二进制文件中位于__start_exception_table和__end_exception_table之间。每个表项都对应于一个struct exception_table_entry实例,该结构尽管是体系结构相关的,但通常定义如下:

  1. <include/asm-x86/uaccess_32.h>
  2. struct exception_table_entry {
  3. unsigned long insn, fixup;
  4. };


  1. arch/x86/mm/extable_32.c
  2. int fixup_exception(struct pt_regs *regs) {
  3. const struct exception_table_entry *fixup;
  4. fixup = search_exception_tables(regs->eip);
  5. if (fixup) {
  6. regs->eip = fixup->fixup;
  7. return 1;
  8. }
  9. return 0;
  10. }




3.利用异常校正(exception fixup)机制修复坏指针。


  1. Linux内核入门到放弃-进程管理和调度-《深入Linux内核架构》笔记

    进程优先级 硬实时进程 软实时进程 普通进程 O(1)调度.完全公平调度器 抢占式多任务处理(preemptive multitasking):各个进程都分配到一定的时间段可以执行.时间段到期后,内核 ...

  2. Linux内核入门到放弃-网络-《深入Linux内核架构》笔记

    网络命名空间 struct net { atomic_t count; /* To decided when the network * namespace should be freed. */ a ...

  3. Linux内核入门到放弃-模块-《深入Linux内核架构》笔记

    使用模块 依赖关系 modutils标准工具集中的depmod工具可用于计算系统的各个模块之间的依赖关系.每次系统启动时或新模块安装后,通常都会运行该程序.找到的依赖关系保存在一个列表中.默认情况下, ...

  4. Linux从入门到放弃、零基础入门Linux(第三篇):在虚拟机vmware中安装linux(二)超详细手把手教你安装centos6分步图解

    一.继续在vmware中安装centos6.9 本次安装是进行最小化安装,即没有图形化界面的安装,如果是新手,建议安装带图形化界面的centos, 具体参考Linux从入门到放弃.零基础入门Linux ...

  5. Linux从入门到放弃、零基础入门Linux(第四篇):在虚拟机vmware中安装centos7.7

    如果是新手,建议安装带图形化界面的centos,这里以安装centos7.7的64位为例 一.下载系统镜像 镜像文件下载链接https://wiki.centos.org/Download 阿里云官网 ...

  6. Linux内核入门到放弃-页面回收和页交换-《深入Linux内核架构》笔记

    概述 可换出页 只有少量几种页可以换出到交换区,对其他页来说,换出到块设备上与之对应的后备存储器即可,如下所述. 类别为 MAP_ANONYMOUS 的页,没有关联到文件,例如,这可能是进程的栈或是使 ...

  7. Linux内核入门到放弃-内核活动-《深入Linux内核架构》笔记

    中断 中断类型 同步中断和异常.这些由CPU自身产生,针对当前执行的程序 异步中断.这是经典的中断类型,由外部设备产生,可能发生在任意时间. 在退出中断中,内核会检查下列事项. 调度器是否应该选择一个 ...

  8. Linux内核入门到放弃-内存管理-《深入Linux内核架构》笔记

    概述 内存管理的实现涵盖了许多领域: 内存中的物理内存页管理 分配大块内存的伙伴系统 分配较小内存块的slab.slub和slob分配器 分配非连续内存块的vmalloc机制 进程的地址空间 在IA- ...

  9. Linux内核入门到放弃-无持久存储的文件系统-《深入Linux内核架构》笔记

    proc文件系统 proc文件系统是一种虚拟的文件系统,其信息不能从块设备读取.只有在读取文件内容时,才动态生成相应的信息. /proc的内容 内存管理 系统进程的特征数据 文件系统 设备驱动程序 系 ...


  1. 在centos7上编译安装nginx

    题前,先放一个有图有真相的博客链接:https://www.cnblogs.com/zhang-shijie/p/5294162.html 虽然别人说的很详细,但还是记录一下 1.VMWare Wor ...

  2. Android SharedPreferences增,删,查操作

    SharedPreferences是Android平台上一个轻量级的存储类,用来保存应用的一些常用配置,比如Activity状态,Activity暂停时,将此activity的状态保存到SharedP ...

  3. Android加载图片的策略

    实现图片缓存也不难,需要有相应的cache策略.这里我采用 内存-文件-网络 三层cache机制,其中内存缓存包括强引用缓存和软引用缓存(SoftReference),其实网络不算cache,这里姑且 ...

  4. java 大文件分割与组装

    不多说,直接上代码 1 import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; im ...

  5. git 入门教程之紧急修复

    和往常一样,每个人团队开发者都在自己的本地分支上进行日常工作,相互独立又相互联系,一直以来相安无事,可是某天下午,上级领导突然急冲冲的打电话告诉你线上出bug了,需要你紧急修复,下班之前必须解决! 我 ...

  6. 关于linux 安装libxml2

    安装php的时候提示libxml2 未安装 从服务器安装libxml2 提示 libxml.c:3821: error: expected '=', ',', ';', 'asm' or '__att ...

  7. Android项目的targetSDK>=23,在低于Android6.0的部分测试机(类似华为)上运行时出现的系统权限问题

    相信大家对Android6.0以上的动态权限已经有所了解,很多童鞋也已经跃跃欲试地将自己项目的targetSDK升级到了23及其以上,很不幸的是我也成为了其中一员,然而我还是图样图森破了,升级之后的问 ...

  8. centos下mysql授予权限提示ERROR 1133 (42000): Can't find any matching row in the user table

    错误: 给mysql对应的用户授予权限的时候提示报错: 解决方法: 后面才知道原来是同事这边新增了用户没有flush grant all privileges on *.* to 'user'@'%' ...

  9. 几个常用dos网络命令

    ping www.baidu.com 测试网络的同时,查看ip地址 1. 如图:百度的ip为浏览器直接输入ip即可进入百度首页. 另外还有, ...

  10. Markdown语法大全

    目录 前言: 1.Markdown基础用法 1.1 目录 1.2 标题 1.3 字体样式 1.4 引用 1.5 图片 1.6 超链接 1.7 列表 1.8 表格 1.9 代码 1.10 流程图 1.1 ...