SLUB object allocation
The kmem_cache structure is defined as follows:
struct kmem_cache {
    struct kmem_cache_cpu __percpu *cpu_slab;
    /* Used for retrieving partial slabs etc */
    unsigned long flags;
    unsigned long min_partial;
    int size;             /* The size of an object including meta data */
    int object_size;      /* The size of an object without meta data */
    int offset;           /* Free pointer offset. */
    int cpu_partial;      /* Number of per cpu partial objects to keep around */
    struct kmem_cache_order_objects oo;   /* Allocation and freeing of slabs */
    struct kmem_cache_order_objects max;
    struct kmem_cache_order_objects min;
    gfp_t allocflags;     /* gfp flags to use on each alloc */
    int refcount;         /* Refcount for slab cache destroy */
    void (*ctor)(void *);
    int inuse;            /* Offset to metadata */
    int align;            /* Alignment */
    int reserved;         /* Reserved bytes at the end of slabs */
    const char *name;     /* Name (only for display!) */
    struct list_head list;    /* List of slab caches */
    int red_left_pad;     /* Left redzone padding size */
#ifdef CONFIG_SYSFS
    struct kobject kobj;  /* For sysfs */
#endif
#ifdef CONFIG_MEMCG_KMEM
    struct memcg_cache_params memcg_params;
    int max_attr_size;    /* for propagation, maximum size of a stored attr */
#ifdef CONFIG_SYSFS
    struct kset *memcg_kset;
#endif
#endif
#ifdef CONFIG_NUMA
    /*
     * Defragmentation by allocating from a remote node.
     */
    int remote_node_defrag_ratio;
#endif
#ifdef CONFIG_KASAN
    struct kasan_cache kasan_info;
#endif
    struct kmem_cache_node *node[MAX_NUMNODES];
};
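Many of these fields can be observed on a running system through the SLUB sysfs interface under /sys/kernel/slab/<cache>/ (assuming CONFIG_SYSFS and a SLUB kernel). The short user-space sketch below is only an illustration, not kernel code; it reads a few attributes of the kmalloc-64 cache, where object_size, slab_size, order, objs_per_slab and cpu_partial correspond roughly to object_size, size, oo and cpu_partial above:

/* Sketch: dump a few SLUB sysfs attributes for one cache.
 * Assumes a SLUB kernel with sysfs enabled; pick any cache name you have. */
#include <stdio.h>

static void show_attr(const char *cache, const char *attr)
{
    char path[256], buf[128];
    FILE *f;

    snprintf(path, sizeof(path), "/sys/kernel/slab/%s/%s", cache, attr);
    f = fopen(path, "r");
    if (!f) {
        printf("%-14s (unavailable)\n", attr);
        return;
    }
    if (fgets(buf, sizeof(buf), f))
        printf("%-14s %s", attr, buf);   /* buf keeps its trailing newline */
    fclose(f);
}

int main(void)
{
    const char *cache = "kmalloc-64";    /* any directory under /sys/kernel/slab/ */
    const char *attrs[] = { "object_size", "slab_size", "order",
                            "objs_per_slab", "cpu_partial", "min_partial" };

    for (unsigned i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++)
        show_attr(cache, attrs[i]);
    return 0;
}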
kmem_cache_cpu is defined as follows:
struct kmem_cache_cpu {
    void **freelist;      /* Pointer to next available object */
    unsigned long tid;    /* Globally unique transaction id */
    struct page *page;    /* The slab from which we are allocating */
    struct page *partial; /* Partially allocated frozen slabs */
#ifdef CONFIG_SLUB_STATS
    unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
};
kmem_cache_node is defined as follows:
struct kmem_cache_node {
    spinlock_t list_lock;
#ifdef CONFIG_SLAB
    struct list_head slabs_partial;   /* partial list first, better asm code */
    struct list_head slabs_full;
    struct list_head slabs_free;
    unsigned long free_objects;
    unsigned int free_limit;
    unsigned int colour_next;         /* Per-node cache coloring */
    struct array_cache *shared;       /* shared per node */
    struct alien_cache **alien;       /* on other nodes */
    unsigned long next_reap;          /* updated without locking */
    int free_touched;                 /* updated without locking */
#endif
#ifdef CONFIG_SLUB
    unsigned long nr_partial;
    struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
    atomic_long_t nr_slabs;
    atomic_long_t total_objects;
    struct list_head full;
#endif
#endif
};
In short, to allocate an object SLUB first looks at c->freelist. If that is empty, it transfers the objects of c->page onto the freelist (get_freelist). If that still yields nothing, it looks at c->partial, then at node->partial, then at neighboring NUMA nodes; if all of those fail, it allocates a brand-new slab.
Let's walk through this flow in the code.
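For context, everything below is the machinery behind an ordinary kmem_cache_alloc() call. A minimal kernel-module-style sketch of the caller side (the cache name, struct foo and the module boilerplate are invented for illustration):

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/list.h>

struct foo {                       /* hypothetical object type */
    int a;
    struct list_head link;
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
    struct foo *obj;

    /* Create a cache; SLUB derives size/offset/oo from these arguments. */
    foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
                                  0, SLAB_HWCACHE_ALIGN, NULL);
    if (!foo_cache)
        return -ENOMEM;

    /* This call ends up in slab_alloc_node(), discussed below. */
    obj = kmem_cache_alloc(foo_cache, GFP_KERNEL);
    if (obj)
        kmem_cache_free(foo_cache, obj);
    return 0;
}

static void __exit foo_exit(void)
{
    kmem_cache_destroy(foo_cache);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");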
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
        gfp_t gfpflags, int node, unsigned long addr)
{
    void *object;
    struct kmem_cache_cpu *c;
    struct page *page;
    unsigned long tid;

    s = slab_pre_alloc_hook(s, gfpflags);
    if (!s)
        return NULL;
redo:
    /*
     * Must read kmem_cache cpu data via this cpu ptr. Preemption is
     * enabled. We may switch back and forth between cpus while
     * reading from one cpu area. That does not matter as long
     * as we end up on the original cpu again when doing the cmpxchg.
     *
     * We should guarantee that tid and kmem_cache are retrieved on
     * the same cpu. It could be different if CONFIG_PREEMPT so we need
     * to check if it is matched or not.
     */
    do {
        tid = this_cpu_read(s->cpu_slab->tid);
        c = raw_cpu_ptr(s->cpu_slab);
    } while (IS_ENABLED(CONFIG_PREEMPT) &&
             unlikely(tid != READ_ONCE(c->tid)));

    /*
     * Irqless object alloc/free algorithm used here depends on sequence
     * of fetching cpu_slab's data. tid should be fetched before anything
     * on c to guarantee that object and page associated with previous tid
     * won't be used with current tid. If we fetch tid first, object and
     * page could be one associated with next tid and our alloc/free
     * request will be failed. In this case, we will retry. So, no problem.
     */
    barrier();

    /*
     * The transaction ids are globally unique per cpu and per operation on
     * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
     * occurs on the right processor and that there was no operation on the
     * linked list in between.
     */
    object = c->freelist;
    page = c->page;
    if (unlikely(!object || !node_match(page, node))) {
        object = __slab_alloc(s, gfpflags, node, addr, c);
        stat(s, ALLOC_SLOWPATH);
    } else {
        void *next_object = get_freepointer_safe(s, object);
        ...
The do/while loop guarantees that tid and cpu_slab are read on the same CPU. The barrier() enforces the ordering explained in the comment above it: tid must be fetched before the other cpu_slab fields. If c->freelist is empty (or c->page does not match the requested node), the slow path is taken: __slab_alloc disables local interrupts and then calls ___slab_alloc:
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
        unsigned long addr, struct kmem_cache_cpu *c)
{
    void *freelist;
    struct page *page;

    page = c->page;
    if (!page)
        goto new_slab;
redo:
    ...
    /* must check again c->freelist in case of cpu migration or IRQ */
    freelist = c->freelist;
    if (freelist)
        goto load_freelist;

    freelist = get_freelist(s, page);

    if (!freelist) {
        c->page = NULL;
        stat(s, DEACTIVATE_BYPASS);
        goto new_slab;
    }

    stat(s, ALLOC_REFILL);

load_freelist:
    /*
     * freelist is pointing to the list of objects to be used.
     * page is pointing to the page from which the objects are obtained.
     * That page must be frozen for per cpu allocations to work.
     */
    VM_BUG_ON(!c->page->frozen);
    c->freelist = get_freepointer(s, freelist);
    c->tid = next_tid(c->tid);
    return freelist;

new_slab:

    if (c->partial) {
        page = c->page = c->partial;
        c->partial = page->next;
        stat(s, CPU_PARTIAL_ALLOC);
        c->freelist = NULL;
        goto redo;
    }

    freelist = new_slab_objects(s, gfpflags, node, &c);

    if (unlikely(!freelist)) {
        slab_out_of_memory(s, gfpflags, node);
        return NULL;
    }

    page = c->page;
    if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
        goto load_freelist;

    /* Only entered in the debug case */
    if (kmem_cache_debug(s) &&
            !alloc_debug_processing(s, page, freelist, addr))
        goto new_slab;  /* Slab failed checks. Next slab needed */

    deactivate_slab(s, page, get_freepointer(s, freelist));
    c->page = NULL;
    c->freelist = NULL;
    return freelist;
}
___slab_alloc is the main body of the allocation slow path. Because of process migration and interrupts, it first re-checks whether c->freelist has an object. If not, get_freelist takes over the objects of c->page: page->freelist is set to NULL, inuse is set to page->objects, and the frozen flag stays set as long as there were objects, so the slab is regarded as fully in use until the per-cpu freelist is consumed.
get_freelist is implemented as follows:
static inline void *get_freelist(struct kmem_cache *s, struct page *page)
{
    struct page new;
    unsigned long counters;
    void *freelist;

    do {
        freelist = page->freelist;
        counters = page->counters;

        new.counters = counters;
        VM_BUG_ON(!new.frozen);

        new.inuse = page->objects;
        new.frozen = freelist != NULL;

    } while (!__cmpxchg_double_slab(s, page,
        freelist, counters,
        NULL, new.counters,
        "get_freelist"));

    return freelist;
}
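get_freelist can copy page->counters into a local struct page and then tweak new.inuse and new.frozen because, in struct page, the counters word overlays the inuse/objects/frozen bitfields; one cmpxchg_double can therefore swap freelist and all of those counters atomically. The user-space sketch below imitates that overlay; the exact widths (16-bit inuse, 15-bit objects, 1-bit frozen) are an assumption modeled on mainline struct page of that era:

#include <stdio.h>

/* Simplified stand-in for the SLUB part of struct page. */
struct fake_page {
    void *freelist;                 /* first free object, or NULL */
    union {
        unsigned long counters;     /* whole word, as used by cmpxchg_double */
        struct {
            unsigned inuse:16;      /* objects handed out */
            unsigned objects:15;    /* total objects in the slab */
            unsigned frozen:1;      /* owned by a per-cpu slab? */
        };
    };
};

int main(void)
{
    struct fake_page page = { 0 };
    struct fake_page new;

    page.objects = 32;
    page.inuse = 5;
    page.frozen = 1;

    /* What get_freelist does with its local copy: */
    new.counters = page.counters;        /* copy all three fields in one word */
    new.inuse = page.objects;            /* everything counts as in use now */
    new.frozen = (page.freelist != NULL);

    printf("old: inuse=%u objects=%u frozen=%u\n",
           page.inuse, page.objects, page.frozen);
    printf("new: inuse=%u objects=%u frozen=%u\n",
           new.inuse, new.objects, new.frozen);
    return 0;
}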
If c->page yields no object either, control jumps to the new_slab: label. It first checks c->partial: if a page is there, it becomes c->page and the function retries, so the freelist is refilled from that page. Otherwise it goes to new_slab_objects:
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
        int node, struct kmem_cache_cpu **pc)
{
    void *freelist;
    struct kmem_cache_cpu *c = *pc;
    struct page *page;

    freelist = get_partial(s, flags, node, c);

    if (freelist)
        return freelist;

    page = new_slab(s, flags, node);
    if (page) {
        c = raw_cpu_ptr(s->cpu_slab);
        if (c->page)
            flush_slab(s, c);

        /*
         * No other reference to the page yet so we can
         * muck around with it freely without cmpxchg
         */
        freelist = page->freelist;
        page->freelist = NULL;

        stat(s, ALLOC_SLAB);
        c->page = page;
        *pc = c;
    } else
        freelist = NULL;

    return freelist;
}
get_partial first searches the node that matches the request; if nothing is found there, it looks for objects on the other nearby nodes. get_partial_node:
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
        struct kmem_cache_cpu *c, gfp_t flags)
{
    struct page *page, *page2;
    void *object = NULL;
    int available = 0;
    int objects;

    /*
     * Racy check. If we mistakenly see no partial slabs then we
     * just allocate an empty slab. If we mistakenly try to get a
     * partial slab and there is none available then get_partials()
     * will return NULL.
     */
    if (!n || !n->nr_partial)
        return NULL;

    spin_lock(&n->list_lock);
    list_for_each_entry_safe(page, page2, &n->partial, lru) {
        void *t;

        if (!pfmemalloc_match(page, flags))
            continue;

        t = acquire_slab(s, n, page, object == NULL, &objects);
        if (!t)
            break;

        available += objects;
        if (!object) {
            c->page = page;
            stat(s, ALLOC_FROM_PARTIAL);
            object = t;
        } else {
            put_cpu_partial(s, page, 0);
            stat(s, CPU_PARTIAL_NODE);
        }
        if (!kmem_cache_has_cpu_partial(s)
            || available > s->cpu_partial / 2)
            break;
    }
    spin_unlock(&n->list_lock);
    return object;
}
This function keeps taking slabs off the node's partial list: each slab is frozen and removed from the node's partial list, the first one becomes c->page, and the rest are moved onto c->partial, until the number of available objects exceeds s->cpu_partial / 2 (see the sketch below).
If nothing is found on this node, neighboring nodes are tried; that path is not covered here.
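To make the refill policy concrete, here is a small single-threaded user-space sketch of that loop (no locking, no cmpxchg; the structure and names are invented, only the "first slab becomes the active page, the rest go to the per-cpu partial list, stop past cpu_partial / 2" logic is modeled on the code above):

#include <stdio.h>
#include <stddef.h>

struct fake_slab {
    int objects;            /* total objects in this slab */
    int inuse;              /* objects already allocated */
    struct fake_slab *next; /* partial list linkage (simplified) */
};

/* Mimics get_partial_node(): pull slabs off the node list until we have
 * gathered more than cpu_partial / 2 free objects. */
static struct fake_slab *refill_from_node(struct fake_slab **node_partial,
                                          struct fake_slab **cpu_partial,
                                          int cpu_partial_limit)
{
    struct fake_slab *active = NULL;
    int available = 0;

    while (*node_partial) {
        struct fake_slab *slab = *node_partial;
        *node_partial = slab->next;             /* remove_partial() */
        slab->next = NULL;

        available += slab->objects - slab->inuse;
        if (!active) {
            active = slab;                      /* becomes c->page */
        } else {
            slab->next = *cpu_partial;          /* put_cpu_partial() */
            *cpu_partial = slab;
        }
        if (available > cpu_partial_limit / 2)
            break;
    }
    printf("gathered %d free objects\n", available);
    return active;
}

int main(void)
{
    struct fake_slab s3 = { 16, 10, NULL };
    struct fake_slab s2 = { 16, 4, &s3 };
    struct fake_slab s1 = { 16, 12, &s2 };
    struct fake_slab *node_partial = &s1, *cpu_partial = NULL;
    struct fake_slab *active;

    active = refill_from_node(&node_partial, &cpu_partial, 30);
    printf("active slab free objects: %d\n",
           active ? active->objects - active->inuse : 0);
    return 0;
}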
acquire_slab:
static inline void *acquire_slab(struct kmem_cache *s,
        struct kmem_cache_node *n, struct page *page,
        int mode, int *objects)
{
    void *freelist;
    unsigned long counters;
    struct page new;

    lockdep_assert_held(&n->list_lock);

    /*
     * Zap the freelist and set the frozen bit.
     * The old freelist is the list of objects for the
     * per cpu allocation list.
     */
    freelist = page->freelist;
    counters = page->counters;
    new.counters = counters;
    *objects = new.objects - new.inuse;
    if (mode) {
        new.inuse = page->objects;
        new.freelist = NULL;
    } else {
        new.freelist = freelist;
    }

    VM_BUG_ON(new.frozen);
    new.frozen = 1;

    if (!__cmpxchg_double_slab(s, page,
            freelist, counters,
            new.freelist, new.counters,
            "acquire_slab"))
        return NULL;

    remove_partial(n, page);
    WARN_ON(!freelist);
    return freelist;
}
A small detail here: when object is still NULL (mode == true), the page's freelist is zapped to NULL, marking this slab as the one currently being allocated from. If an object has already been found, the page keeps its freelist, and put_cpu_partial later chains it via page->next onto the previous c->partial, so the list is never broken.
put_cpu_partial:
static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
    struct page *oldpage;
    int pages;
    int pobjects;

    preempt_disable();
    do {
        pages = 0;
        pobjects = 0;
        oldpage = this_cpu_read(s->cpu_slab->partial);

        if (oldpage) {
            pobjects = oldpage->pobjects;
            pages = oldpage->pages;
            if (drain && pobjects > s->cpu_partial) {
                unsigned long flags;
                /*
                 * partial array is full. Move the existing
                 * set to the per node partial list.
                 */
                local_irq_save(flags);
                unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
                local_irq_restore(flags);
                oldpage = NULL;
                pobjects = 0;
                pages = 0;
                stat(s, CPU_PARTIAL_DRAIN);
            }
        }

        pages++;
        pobjects += page->objects - page->inuse;

        page->pages = pages;
        page->pobjects = pobjects;
        page->next = oldpage;

    } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
                != oldpage);
    if (unlikely(!s->cpu_partial)) {
        unsigned long flags;

        local_irq_save(flags);
        unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
        local_irq_restore(flags);
    }
    preempt_enable();
#endif
}
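The per-cpu partial list is a singly linked list of pages chained through page->next, with the running totals pages/pobjects stored on the list head. A simplified single-threaded user-space sketch of that bookkeeping (no cmpxchg, no real unfreeze; the structure and names are invented for illustration):

#include <stdio.h>

struct fake_slab {
    int objects;            /* total objects in this slab */
    int inuse;              /* objects already allocated */
    int pages;              /* meaningful only on the list head */
    int pobjects;           /* meaningful only on the list head */
    struct fake_slab *next;
};

/* Mimics put_cpu_partial(): prepend a slab to the per-cpu partial list,
 * draining the whole list when it has grown past the limit. */
static void push_cpu_partial(struct fake_slab **cpu_partial,
                             struct fake_slab *slab, int cpu_partial_limit)
{
    struct fake_slab *oldpage = *cpu_partial;
    int pages = 0, pobjects = 0;

    if (oldpage) {
        pages = oldpage->pages;
        pobjects = oldpage->pobjects;
        if (pobjects > cpu_partial_limit) {
            /* The real code calls unfreeze_partials() and moves these
             * slabs back to the node; the sketch just forgets them. */
            printf("draining %d slabs / %d free objects\n", pages, pobjects);
            oldpage = NULL;
            pages = 0;
            pobjects = 0;
        }
    }

    pages++;
    pobjects += slab->objects - slab->inuse;
    slab->pages = pages;        /* new head carries the totals */
    slab->pobjects = pobjects;
    slab->next = oldpage;
    *cpu_partial = slab;
}

int main(void)
{
    struct fake_slab slabs[4] = {
        { 16, 2 }, { 16, 1 }, { 16, 0 }, { 16, 8 },
    };
    struct fake_slab *cpu_partial = NULL;
    int i;

    for (i = 0; i < 4; i++)
        push_cpu_partial(&cpu_partial, &slabs[i], 20);

    printf("head now tracks %d slabs / %d free objects\n",
           cpu_partial->pages, cpu_partial->pobjects);
    return 0;
}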
If no usable object is found on the nodes either, memory is requested from the page allocator and a new slab is built:
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
    struct page *page;
    struct kmem_cache_order_objects oo = s->oo;
    gfp_t alloc_gfp;
    void *start, *p;
    int idx, order;

    flags &= gfp_allowed_mask;

    if (gfpflags_allow_blocking(flags))
        local_irq_enable();

    flags |= s->allocflags;

    /*
     * Let the initial higher-order allocation fail under memory pressure
     * so we fall-back to the minimum order allocation.
     */
    alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
    if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
        alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;

    page = alloc_slab_page(s, alloc_gfp, node, oo);
    if (unlikely(!page)) {
        oo = s->min;
        alloc_gfp = flags;
        /*
         * Allocation may have failed due to fragmentation.
         * Try a lower order alloc if possible
         */
        page = alloc_slab_page(s, alloc_gfp, node, oo);
        if (unlikely(!page))
            goto out;
        stat(s, ORDER_FALLBACK);
    }
    ...
    page->objects = oo_objects(oo);

    order = compound_order(page);
    page->slab_cache = s;
    __SetPageSlab(page);
    if (page_is_pfmemalloc(page))
        SetPageSlabPfmemalloc(page);

    start = page_address(page);

    if (unlikely(s->flags & SLAB_POISON))
        memset(start, POISON_INUSE, PAGE_SIZE << order);

    kasan_poison_slab(page);

    for_each_object_idx(p, idx, s, start, page->objects) {
        setup_object(s, page, p);
        if (likely(idx < page->objects))
            set_freepointer(s, p, p + s->size);
        else
            set_freepointer(s, p, NULL);
    }

    page->freelist = fixup_red_left(s, start);
    page->inuse = page->objects;
    page->frozen = 1;

out:
    if (gfpflags_allow_blocking(flags))
        local_irq_disable();
    if (!page)
        return NULL;

    mod_zone_page_state(page_zone(page),
        (s->flags & SLAB_RECLAIM_ACCOUNT) ?
        NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
        1 << oo_order(oo));

    inc_slabs_node(s, page_to_nid(page), page->objects);

    return page;
}
So a new slab is built by allocating a compound page of the chosen order, freezing it, and initializing its objects: at each object's free-pointer offset the address of the next available object is written, forming the initial freelist. Back in new_slab_objects, c->page is set to the freshly allocated page and page->freelist is set to NULL to mark it as in use by this CPU.
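The "next free object" pointer lives inside each free object itself, at offset s->offset. A small user-space sketch of that encoding (illustrative only; a fixed offset of 0 and a made-up object size, no red zones or poisoning):

#include <stdio.h>
#include <stdlib.h>

#define OBJ_SIZE    64      /* made-up object size, like s->size */
#define FP_OFFSET   0       /* free pointer offset, like s->offset */
#define NR_OBJECTS  8

/* Write the address of the next free object inside this object. */
static void set_freepointer(void *object, void *next)
{
    *(void **)((char *)object + FP_OFFSET) = next;
}

/* Read the address of the next free object from inside this object. */
static void *get_freepointer(void *object)
{
    return *(void **)((char *)object + FP_OFFSET);
}

int main(void)
{
    /* One "slab": a contiguous block carved into NR_OBJECTS objects. */
    char *start = malloc((size_t)OBJ_SIZE * NR_OBJECTS);
    void *freelist, *obj;
    int i;

    /* What allocate_slab() does: chain every object to the next one. */
    for (i = 0; i < NR_OBJECTS; i++) {
        char *p = start + (size_t)i * OBJ_SIZE;
        set_freepointer(p, i < NR_OBJECTS - 1 ? p + OBJ_SIZE : NULL);
    }
    freelist = start;

    /* What the fast path does: pop the head and follow the chain. */
    while ((obj = freelist) != NULL) {
        freelist = get_freepointer(obj);
        printf("allocated object at offset %td\n", (char *)obj - start);
    }

    free(start);
    return 0;
}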
That completes the SLUB object allocation path.