用户空间缺页异常pte_handle_fault()分析--(下)--写时复制【转】

转自：http://blog.csdn.net/vanbreaker/article/details/7955713

在pte_handle_fault()中，如果触发异常的页存在于主存中，那么该异常往往是由写了一个只读页触发的，此时需要进行COW(写时复制操作)。如当一个父进程通过fork()创建了一个子进程时，子进程将会共享父进程的页框。之后，无论是父进程还是子进程要对相应的内存进行写操作，都要进行COW，也就是为自己重新分配一个页框，并把之前的数据复制到页框中去，再写。

static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
entry = *pte;
...
...
...
/********页在主存中的情况***********/
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) {//异常由写访问触发
if (!pte_write(entry))//而对应的页是不可写的
return do_wp_page(mm, vma, address, //此时必须进行写时复制的操作
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, entry);
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
flush_tlb_page(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
return 0;
}

可以看到，hand_pte_fault()函数处理页存在于主存中的情况的关键操作都集中在do_wp_page()函数上。该函数是用来处理COW的，不过在COW之前先要做一些检查，比如说，如果对应的页只有一个进程使用，那么便可以直接修改页的权限为可读可写，而不进行COW。总之，不到不得以的情况下是不会进行COW的。

static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
spinlock_t *ptl, pte_t orig_pte)
{
struct page *old_page, *new_page;
pte_t entry;
int reuse = 0, ret = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
old_page = vm_normal_page(vma, address, orig_pte);//获取共享页
if (!old_page) {//获取共享页失败
/*
* VM_MIXEDMAP !pfn_valid() case
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable as we can't do any dirty
* accounting on raw pfn maps.
*/
/*如果vma的映射本来就是共享且可写的，则跳转至reuse直接使用orig_pte对应的页*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
goto reuse;
/*否则跳转至gotten分配一个页*/
goto gotten;
}
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
/*下面首先判断匿名页的情况，如果old_page是匿名页，并且只有一个进程使用它(reuse为1)，则
则直接使用该页*/
if (PageAnon(old_page) && !PageKsm(old_page)) {
/*这里先判断是否有其他进程竞争，修改了页表*/
if (!trylock_page(old_page)) {
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
goto unlock;
}
page_cache_release(old_page);
}
/*确定没有其他进程竞争，则进行reuse判断，通过reuse_swap_page()函数判断
old_page的_mapcount字段是否为0，是的话则表明只有一个进程使用该匿名页*/
reuse = reuse_swap_page(old_page);
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {//如果vma的映射本来就是共享且可写的
/*
* Only catch write-faults on shared writable pages,
* read-only shared pages can get COWed by
* get_user_pages(.write=1, .force=1).
*/
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
struct vm_fault vmf;
int tmp;
vmf.virtual_address = (void __user *)(address &
PAGE_MASK);
vmf.pgoff = old_page->index;
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
vmf.page = old_page;
/*
* Notify the address space that the page is about to
* become writable so that it can prohibit this or wait
* for the page to get into an appropriate state.
*
* We do this without the lock held, so that it can
* sleep if it needs to.
*/
page_cache_get(old_page);//增加old_page的引用计数作为保护
pte_unmap_unlock(page_table, ptl);
/*这里通知即将修改页的权限*/
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
/*如果无法修改的话，则跳转到unwritable_page*/
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
goto unwritable_page;
}
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
lock_page(old_page);
if (!old_page->mapping) {
ret = 0; /* retry the fault */
unlock_page(old_page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(old_page));
/*
* Since we dropped the lock we need to revalidate
* the PTE as someone else may have changed it. If
* they did, we just return, as we can count on the
* MMU to tell us if they didn't also make it writable.
*/
/*走到这里表示已经成功修改了页的权限了，这里同样重新获取页表，判断是否和之前一致*/
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
goto unlock;
}
page_mkwrite = 1;
}
dirty_page = old_page;
get_page(dirty_page);
reuse = 1;
}
if (reuse) {//reuse处理，也就是说不进行COW，可以直接在old_page上进行写操作
reuse:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);//标记_PAGE_ACCESSED位
entry = maybe_mkwrite(pte_mkdirty(entry), vma);//将页的权限修改为可读可写，并且标记为脏页
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, entry);
ret |= VM_FAULT_WRITE;
goto unlock;
}
/*
* Ok, we need to copy. Oh, well..
*/
/***************终于走到了不得已的一步了，下面只好进行COW了********************/
page_cache_get(old_page);
gotten:
pte_unmap_unlock(page_table, ptl);
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (is_zero_pfn(pte_pfn(orig_pte))) {
new_page = alloc_zeroed_user_highpage_movable(vma, address);//分配一个零页面
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);//分配一个非零页面
if (!new_page)
goto oom;
cow_user_page(new_page, old_page, address, vma);//将old_page中的数据拷贝到new_page
}
__SetPageUptodate(new_page);
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if ((vma->vm_flags & VM_LOCKED) && old_page) {
lock_page(old_page); /* for LRU manipulation */
clear_page_mlock(old_page);
unlock_page(old_page);
}
if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
/*
* Re-check the pte - we dropped the lock
*/
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter(mm, file_rss);
inc_mm_counter(mm, anon_rss);
}
} else
inc_mm_counter(mm, anon_rss);
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);//获取new_page的pte
entry = maybe_mkwrite(pte_mkdirty(entry), vma);//修改new_page的权限
/*
* Clear the pte entry and flush it first, before updating the
* pte with the new entry. This will avoid a race condition
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
ptep_clear_flush(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
set_pte_at_notify(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
if (old_page) {
/*
* Only after switching the pte to the new page may
* we remove the mapcount here. Otherwise another
* process may come and find the rmap count decremented
* before the pte is switched to the new page, and
* "reuse" the old page writing into it while our pte
* here still points into it and can be read by other
* threads.
*
* The critical issue is to order this
* page_remove_rmap with the ptp_clear_flush above.
* Those stores are ordered by (if nothing else,)
* the barrier present in the atomic_add_negative
* in page_remove_rmap.
*
* Then the TLB flush in ptep_clear_flush ensures that
* no process can access the old page before the
* decremented mapcount is visible. And the old page
* cannot be reused until after the decremented
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
page_remove_rmap(old_page);
}
/* Free the old page.. */
new_page = old_page;
ret |= VM_FAULT_WRITE;
} else
mem_cgroup_uncharge_page(new_page);
if (new_page)
page_cache_release(new_page);
if (old_page)
page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
if (dirty_page) {
/*
* Yes, Virginia, this is actually required to prevent a race
* with clear_page_dirty_for_io() from clearing the page dirty
* bit after it clear all dirty ptes, but before a racing
* do_wp_page installs a dirty pte.
*
* do_no_page is protected similarly.
*/
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
set_page_dirty_balance(dirty_page, page_mkwrite);
}
put_page(dirty_page);
if (page_mkwrite) {
struct address_space *mapping = dirty_page->mapping;
set_page_dirty(dirty_page);
unlock_page(dirty_page);
page_cache_release(dirty_page);
if (mapping) {
/*
* Some device drivers do not set page.mapping
* but still dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
}
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
}
return ret;
oom_free_new:
page_cache_release(new_page);
oom:
if (old_page) {
if (page_mkwrite) {
unlock_page(old_page);
page_cache_release(old_page);
}
page_cache_release(old_page);
}
return VM_FAULT_OOM;
unwritable_page:
page_cache_release(old_page);
return ret;
}

用户空间缺页异常pte_handle_fault()分析--(下)--写时复制【转】的更多相关文章

用户空间缺页异常pte_handle_fault()分析--(上)【转】
转自:http://blog.csdn.net/vanbreaker/article/details/7881206 版权声明:本文为博主原创文章,未经博主允许不得转载. 前面简单的分析了内核处理用户 ...
fork()和写时复制
写时复制技术最初产生于Unix系统,用于实现一种傻瓜式的进程创建:当发出fork( )系统调用时,内核原样复制父进程的整个地址空间并把复制的那一份分配给子进程.这种行为是非常耗时的,因为它需要: · ...
Linux的fork()写时复制原则（转）
写时复制技术最初产生于Unix系统,用于实现一种傻瓜式的进程创建:当发出fork( )系统调用时,内核原样复制父进程的整个地址空间并把复制的那一份分配给子进程.这种行为是非常耗时的,因为它需要: · ...
Linux进程管理——fork()和写时复制
写时复制技术最初产生于Unix系统,用于实现一种傻瓜式的进程创建:当发出fork( )系统调用时,内核原样复制父进程的整个地址空间并把复制的那一份分配给子进程.这种行为是非常耗时的,因为它需要: · ...
写时复制和fork,vfork,clone
写时复制原理: 用了“引用计数”,会有一个变量用于保存引用的数量.当第一个类构造时,string的构造函数会根据传入的参数从堆上分配内存,当有其它类需要这块内存时,这个计数为自动累加,当有类析构时, ...
Redis持久化之父子进程与写时复制
之所以将Linux底层的写时复制技术放在Redis篇幅下,是因为Redis进行RDB持久化时,BGSAVE(后面称之为"后台保存")会开辟一个子进程,将数据从内存写进磁盘,这儿我产 ...
php 垃圾回收机制----写时复制和引用计数
PHP使用引用计数和写时复制来管理内存.写时复制保证了变量间复制值不浪费内存,引用计数保证了当变量不再需要时,将内存释放给操作系统. 要理解PHP内存管理,首先要理解一个概念----符号表. 符号表的 ...
[转]QVector与QByteArray——Qt的写时复制(copy on write)技术
我们在之前的博文QVector的内存分配策略与再谈QVector与std::vector——使用装饰者让std::vector支持连续赋值中简单聊了聊QVector内存分配和赋值方面的一点东西,今天接 ...
Java进阶知识点6：并发容器背后的设计理念 - 锁分段、写时复制和弱一致性
一.背景容器是Java编程中使用频率很高的组件,但Java默认提供的基本容器(ArrayList,HashMap等)均不是线程安全的.当容器和多线程并发编程相遇时,程序员又该何去何从呢? 通常有两种 ...

随机推荐

移动端的拖拽排序在react中实现了解一下
最近做一个拖拽排序的功能找了好几个有一个步骤简单,结合redux最好不过了,话不多说上代码第一步: npm install react-draggable-tags --save 第二步 sort. ...
笔记-DB-mongodb-常用操作-1
笔记-DB-mongodb-常用操作-1 1. 启动及连接 1.1. 启动启动mongod windows下: 1. 如已添加服务 net start <service name> ...
5，pandas高级数据处理
1.删除重复元素使用duplicated()函数检测重复的行,返回元素为布尔类型的Series对象,每个元素对应一行,如果该行不是第一次出现,则元素为True - keep参数:指定保留哪一重复的行 ...
Android ANR详解
如何避免KeyDispatchTimeout 1:UI线程尽量只做跟UI相关的工作 2:耗时的工作(比如数据库操作,I/O,连接网络或者别的有可能阻碍UI线程的操作)把它放入单独的线程处理 3:尽量用 ...
斐波那契数列（递归）&求100以内的素数
Java 5 添加了 java.util.Scanner 类,这是一个用于扫描输入文本的新的实用程序.它是以前的 StringTokenizer 和 Matcher 类之间的某种结合.由于任何数据都 ...
linux 多播
1.概念单播是用于两个主机之间传送数据,广播是一个主机对局域网内的所有主机发送数据.而多播,又称为组播,它是对一组特定的主机通信.将网络上同一类型业务逻辑上分组,只和组内的成员通信,其它主机没有加 ...
剑指Offer - 九度1512 - 用两个栈实现队列
剑指Offer - 九度1512 - 用两个栈实现队列2013-11-29 21:23 题目描述: 用两个栈来实现一个队列,完成队列的Push和Pop操作.队列中的元素为int类型. 输入: 每个输入 ...
windows 10的资源管理器不显示映射的网络驱动器怎么办？
最近在使用映射网络驱动器的时候出现一个奇怪的现象.事情源于我在资源管理器里面映射了来自多个不同账号的网络驱动器.使用的是win10系统.映射不同账号的网络驱动器是不允许的.于是只能删掉其他账号和凭证重 ...
Box布局管理
创建wx.BoxSizer对象时可以指定布局方向: hbox = wx.BoxSizer(wx.HORIZONTAL) 设置为水平方向 hbox = wx.BoxSizer() 默认就是就是水平方向的 ...
python中os.path.join和join的区别
这两个函数都是python的系统函数,都有“组合”.“连接”之意,但用法和应用场景千差万别函数说明: 1.join函数用法:用于连接字符串数组.将字符串.元组.列表中的元素以指定的字符(即分隔符) ...

用户空间缺页异常pte_handle_fault()分析--(下)--写时复制【转】

用户空间缺页异常pte_handle_fault()分析--(下)--写时复制【转】的更多相关文章

随机推荐

热门专题