The kmem_cache structure is defined as follows:

struct kmem_cache {
    struct kmem_cache_cpu __percpu *cpu_slab;
    /* Used for retriving partial slabs etc */
    unsigned long flags;
    unsigned long min_partial;
    int size;            /* The size of an object including meta data */
    int object_size;     /* The size of an object without meta data */
    int offset;          /* Free pointer offset. */
    int cpu_partial;     /* Number of per cpu partial objects to keep around */
    struct kmem_cache_order_objects oo;  /* Allocation and freeing of slabs */
    struct kmem_cache_order_objects max;
    struct kmem_cache_order_objects min;
    gfp_t allocflags;    /* gfp flags to use on each alloc */
    int refcount;        /* Refcount for slab cache destroy */
    void (*ctor)(void *);
    int inuse;           /* Offset to metadata */
    int align;           /* Alignment */
    int reserved;        /* Reserved bytes at the end of slabs */
    const char *name;    /* Name (only for display!) */
    struct list_head list;  /* List of slab caches */
    int red_left_pad;    /* Left redzone padding size */
#ifdef CONFIG_SYSFS
    struct kobject kobj;    /* For sysfs */
#endif
#ifdef CONFIG_MEMCG_KMEM
    struct memcg_cache_params memcg_params;
    int max_attr_size;   /* for propagation, maximum size of a stored attr */
#ifdef CONFIG_SYSFS
    struct kset *memcg_kset;
#endif
#endif

#ifdef CONFIG_NUMA
    /*
     * Defragmentation by allocating from a remote node.
     */
    int remote_node_defrag_ratio;
#endif

#ifdef CONFIG_KASAN
    struct kasan_cache kasan_info;
#endif

    struct kmem_cache_node *node[MAX_NUMNODES];
};

kmem_cache_cpu is defined as follows:

struct kmem_cache_cpu {
    void **freelist;       /* Pointer to next available object */
    unsigned long tid;     /* Globally unique transaction id */
    struct page *page;     /* The slab from which we are allocating */
    struct page *partial;  /* Partially allocated frozen slabs */
#ifdef CONFIG_SLUB_STATS
    unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
};

kmem_cache_node is defined as follows:

struct kmem_cache_node {
    spinlock_t list_lock;

#ifdef CONFIG_SLAB
    struct list_head slabs_partial;  /* partial list first, better asm code */
    struct list_head slabs_full;
    struct list_head slabs_free;
    unsigned long free_objects;
    unsigned int free_limit;
    unsigned int colour_next;        /* Per-node cache coloring */
    struct array_cache *shared;      /* shared per node */
    struct alien_cache **alien;      /* on other nodes */
    unsigned long next_reap;         /* updated without locking */
    int free_touched;                /* updated without locking */
#endif

#ifdef CONFIG_SLUB
    unsigned long nr_partial;
    struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
    atomic_long_t nr_slabs;
    atomic_long_t total_objects;
    struct list_head full;
#endif
#endif
};

In summary, when SLUB allocates an object it looks at c->freelist first. If that is empty, it transfers the objects from c->page into the per-cpu freelist (get_freelist). If that still yields nothing, it tries the slabs on c->partial; failing that, the node's partial list (node->partial); failing that, partial slabs on neighboring NUMA nodes; and only then does it allocate a brand-new slab.
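Before diving into the source, it helps to see the core bookkeeping in isolation: a free object stores the address of the next free object inside itself, at a fixed offset. Below is a tiny userspace model of such a freelist. It is an illustration only — the names, the global freelist and the offset 0 are made up for the example; the kernel keeps this state per cpu and uses s->offset.

#include <stdio.h>
#include <stdlib.h>

#define OBJ_SIZE 64   /* stand-in for s->size */
#define NR_OBJS  8    /* stand-in for page->objects */

static void *freelist; /* stand-in for c->freelist */

/* Chain every object to the next one, the way a freshly allocated slab is set up. */
static void freelist_init(char *slab)
{
    int i;

    for (i = 0; i < NR_OBJS - 1; i++)
        *(void **)(slab + i * OBJ_SIZE) = slab + (i + 1) * OBJ_SIZE;
    *(void **)(slab + (NR_OBJS - 1) * OBJ_SIZE) = NULL;
    freelist = slab;
}

/* Pop the head of the freelist, like the allocation fast path. */
static void *toy_alloc(void)
{
    void *object = freelist;

    if (object)
        freelist = *(void **)object;  /* what get_freepointer() does */
    return object;
}

/* Push the object back, like the free fast path. */
static void toy_free(void *object)
{
    *(void **)object = freelist;      /* what set_freepointer() does */
    freelist = object;
}

int main(void)
{
    char *slab = malloc((size_t)OBJ_SIZE * NR_OBJS);
    void *a, *b;

    if (!slab)
        return 1;
    freelist_init(slab);
    a = toy_alloc();
    b = toy_alloc();
    printf("first two objects: %p %p\n", a, b);
    toy_free(b);
    toy_free(a);
    free(slab);
    return 0;
}

The real thing differs mainly in where this state lives (per cpu, per page, per node) and in how it is updated (cmpxchg instead of plain loads and stores), which is exactly what the rest of this walk-through covers.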

Let's follow this flow through the code.

static __always_inline void *slab_alloc_node(struct kmem_cache *s,
        gfp_t gfpflags, int node, unsigned long addr)
{
    void *object;
    struct kmem_cache_cpu *c;
    struct page *page;
    unsigned long tid;

    s = slab_pre_alloc_hook(s, gfpflags);
    if (!s)
        return NULL;
redo:
    /*
     * Must read kmem_cache cpu data via this cpu ptr. Preemption is
     * enabled. We may switch back and forth between cpus while
     * reading from one cpu area. That does not matter as long
     * as we end up on the original cpu again when doing the cmpxchg.
     *
     * We should guarantee that tid and kmem_cache are retrieved on
     * the same cpu. It could be different if CONFIG_PREEMPT so we need
     * to check if it is matched or not.
     */
    do {
        tid = this_cpu_read(s->cpu_slab->tid);
        c = raw_cpu_ptr(s->cpu_slab);
    } while (IS_ENABLED(CONFIG_PREEMPT) &&
         unlikely(tid != READ_ONCE(c->tid)));

    /*
     * Irqless object alloc/free algorithm used here depends on sequence
     * of fetching cpu_slab's data. tid should be fetched before anything
     * on c to guarantee that object and page associated with previous tid
     * won't be used with current tid. If we fetch tid first, object and
     * page could be one associated with next tid and our alloc/free
     * request will be failed. In this case, we will retry. So, no problem.
     */
    barrier();

    /*
     * The transaction ids are globally unique per cpu and per operation on
     * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
     * occurs on the right processor and that there was no operation on the
     * linked list in between.
     */
    object = c->freelist;
    page = c->page;
    if (unlikely(!object || !node_match(page, node))) {
        object = __slab_alloc(s, gfpflags, node, addr, c);
        stat(s, ALLOC_SLOWPATH);
    } else {
        void *next_object = get_freepointer_safe(s, object);
        ...
The do { ... } while loop guarantees that tid and the kmem_cache_cpu pointer were read on the same CPU. The barrier() makes sure tid is fetched before c->freelist and c->page, so stale per-cpu data cannot be paired with a newer tid. If c->freelist is empty (or c->page does not belong to the requested node), the slow path is taken: __slab_alloc disables local interrupts and then calls ___slab_alloc.
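For completeness, __slab_alloc is little more than an irq-off shell around ___slab_alloc. Paraphrased from the kernel source of this era (details such as the CONFIG_PREEMPT reload may differ slightly between versions):

static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
              unsigned long addr, struct kmem_cache_cpu *c)
{
    void *p;
    unsigned long flags;

    local_irq_save(flags);
#ifdef CONFIG_PREEMPT
    /*
     * We may have been preempted and rescheduled on a different
     * cpu before disabling interrupts, so reload the per-cpu area.
     */
    c = this_cpu_ptr(s->cpu_slab);
#endif
    p = ___slab_alloc(s, gfpflags, node, addr, c);
    local_irq_restore(flags);
    return p;
}

___slab_alloc itself is where the real work happens: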

static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
              unsigned long addr, struct kmem_cache_cpu *c)
{
    void *freelist;
    struct page *page;

    page = c->page;
    if (!page)
        goto new_slab;
redo:

    ...

    /* must check again c->freelist in case of cpu migration or IRQ */
    freelist = c->freelist;
    if (freelist)
        goto load_freelist;

    freelist = get_freelist(s, page);

    if (!freelist) {
        c->page = NULL;
        stat(s, DEACTIVATE_BYPASS);
        goto new_slab;
    }

    stat(s, ALLOC_REFILL);

load_freelist:
    /*
     * freelist is pointing to the list of objects to be used.
     * page is pointing to the page from which the objects are obtained.
     * That page must be frozen for per cpu allocations to work.
     */
    VM_BUG_ON(!c->page->frozen);
    c->freelist = get_freepointer(s, freelist);
    c->tid = next_tid(c->tid);
    return freelist;

new_slab:

    if (c->partial) {
        page = c->page = c->partial;
        c->partial = page->next;
        stat(s, CPU_PARTIAL_ALLOC);
        c->freelist = NULL;
        goto redo;
    }

    freelist = new_slab_objects(s, gfpflags, node, &c);

    if (unlikely(!freelist)) {
        slab_out_of_memory(s, gfpflags, node);
        return NULL;
    }

    page = c->page;
    if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
        goto load_freelist;

    /* Only entered in the debug case */
    if (kmem_cache_debug(s) &&
        !alloc_debug_processing(s, page, freelist, addr))
        goto new_slab;  /* Slab failed checks. Next slab needed */

    deactivate_slab(s, page, get_freepointer(s, freelist));
    c->page = NULL;
    c->freelist = NULL;
    return freelist;
}
___slab_alloc is the core of the allocation path. Because the task may have migrated to another CPU, or an interrupt may have refilled the freelist in the meantime, c->freelist is checked once more. If it is still empty, get_freelist takes over c->page's objects: it sets c->page->freelist to NULL and updates the page's frozen and inuse fields, with inuse set to page->objects, so as far as the page is concerned every object is handed out until the slab is given back.

get_freelist is implemented as follows:
static inline void *get_freelist(struct kmem_cache *s, struct page *page)
{
    struct page new;
    unsigned long counters;
    void *freelist;

    do {
        freelist = page->freelist;
        counters = page->counters;

        new.counters = counters;
        VM_BUG_ON(!new.frozen);

        new.inuse = page->objects;
        new.frozen = freelist != NULL;

    } while (!__cmpxchg_double_slab(s, page,
        freelist, counters,
        NULL, new.counters,
        "get_freelist"));

    return freelist;
}
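The new.counters = counters trick works because inuse, objects and frozen are bit-fields that overlay the single counters word in struct page, so copying counters copies all three at once, and __cmpxchg_double_slab can then swap freelist and counters as one atomic pair. A simplified model of that overlay is shown below; this is an assumption-laden sketch, not the real struct page, which packs these fields into a larger union.

/* Simplified model of the SLUB part of struct page (sketch only). */
union slub_page_counters {
    unsigned long counters;     /* the raw word compared/swapped by cmpxchg_double */
    struct {
        unsigned inuse:16;      /* objects handed out from this slab */
        unsigned objects:15;    /* total number of objects in the slab */
        unsigned frozen:1;      /* slab owned by a per-cpu freelist? */
    };
};

With that picture, frozen == 1 means the slab currently belongs to some CPU (as c->page or on c->partial) and must not be managed through the node lists, and inuse == objects means the page itself no longer tracks any free objects — they are all on a per-cpu freelist or in use.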
If c->page cannot supply an object either, control jumps to the new_slab: label. It first checks whether c->partial holds a slab; if so, c->page is pointed at that slab and the code goes back to redo. Otherwise, new_slab_objects is called:
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
            int node, struct kmem_cache_cpu **pc)
{
    void *freelist;
    struct kmem_cache_cpu *c = *pc;
    struct page *page;

    freelist = get_partial(s, flags, node, c);

    if (freelist)
        return freelist;

    page = new_slab(s, flags, node);
    if (page) {
        c = raw_cpu_ptr(s->cpu_slab);
        if (c->page)
            flush_slab(s, c);

        /*
         * No other reference to the page yet so we can
         * muck around with it freely without cmpxchg
         */
        freelist = page->freelist;
        page->freelist = NULL;

        stat(s, ALLOC_SLAB);
        c->page = page;
        *pc = c;
    } else
        freelist = NULL;

    return freelist;
}
get_partial first searches the node that matches the request (or the local node); only if nothing is found there does it go looking for objects on other nearby nodes.
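The wrapper itself is short. As a rough sketch (paraphrased — the exact node-selection and fallback conditions vary between kernel versions):

static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
        struct kmem_cache_cpu *c)
{
    void *object;
    int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node;

    /* Try the partial list of the matching (or local) node first. */
    object = get_partial_node(s, get_node(s, searchnode), c, flags);
    if (object || node != NUMA_NO_NODE)
        return object;

    /* Otherwise scan other nodes, throttled by remote_node_defrag_ratio. */
    return get_any_partial(s, flags, c);
}

get_partial_node: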
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
                struct kmem_cache_cpu *c, gfp_t flags)
{
    struct page *page, *page2;
    void *object = NULL;
    int available = 0;
    int objects;

    /*
     * Racy check. If we mistakenly see no partial slabs then we
     * just allocate an empty slab. If we mistakenly try to get a
     * partial slab and there is none available then get_partials()
     * will return NULL.
     */
    if (!n || !n->nr_partial)
        return NULL;

    spin_lock(&n->list_lock);
    list_for_each_entry_safe(page, page2, &n->partial, lru) {
        void *t;

        if (!pfmemalloc_match(page, flags))
            continue;

        t = acquire_slab(s, n, page, object == NULL, &objects);
        if (!t)
            break;

        available += objects;
        if (!object) {
            c->page = page;
            stat(s, ALLOC_FROM_PARTIAL);
            object = t;
        } else {
            put_cpu_partial(s, page, 0);
            stat(s, CPU_PARTIAL_NODE);
        }
        if (!kmem_cache_has_cpu_partial(s)
            || available > s->cpu_partial / 2)
            break;

    }
    spin_unlock(&n->list_lock);
    return object;
}

This function walks the node's partial list and takes several slabs in a row: each one is frozen (acquire_slab) and removed from the node's partial list. The first slab becomes c->page, and the following ones are queued on c->partial via put_cpu_partial. It stops once the number of available objects exceeds s->cpu_partial / 2.

If nothing is found on this node, neighboring NUMA nodes are searched instead; that path is not covered in this post.

acquire_slab:

static inline void *acquire_slab(struct kmem_cache *s,
        struct kmem_cache_node *n, struct page *page,
        int mode, int *objects)
{
    void *freelist;
    unsigned long counters;
    struct page new;

    lockdep_assert_held(&n->list_lock);

    /*
     * Zap the freelist and set the frozen bit.
     * The old freelist is the list of objects for the
     * per cpu allocation list.
     */
    freelist = page->freelist;
    counters = page->counters;
    new.counters = counters;
    *objects = new.objects - new.inuse;
    if (mode) {
        new.inuse = page->objects;
        new.freelist = NULL;
    } else {
        new.freelist = freelist;
    }

    VM_BUG_ON(new.frozen);
    new.frozen = 1;

    if (!__cmpxchg_double_slab(s, page,
            freelist, counters,
            new.freelist, new.counters,
            "acquire_slab"))
        return NULL;

    remove_partial(n, page);
    WARN_ON(!freelist);
    return freelist;
}

One small detail here: when object is still NULL (mode == true), acquire_slab sets page->freelist to NULL to mark the slab as in use. When an object has already been found, the page keeps its freelist and put_cpu_partial links page->next to the previous c->partial, so the chain of per-cpu partial pages never breaks.

put_cpu_partial:
static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
    struct page *oldpage;
    int pages;
    int pobjects;

    preempt_disable();
    do {
        pages = 0;
        pobjects = 0;
        oldpage = this_cpu_read(s->cpu_slab->partial);

        if (oldpage) {
            pobjects = oldpage->pobjects;
            pages = oldpage->pages;
            if (drain && pobjects > s->cpu_partial) {
                unsigned long flags;
                /*
                 * partial array is full. Move the existing
                 * set to the per node partial list.
                 */
                local_irq_save(flags);
                unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
                local_irq_restore(flags);
                oldpage = NULL;
                pobjects = 0;
                pages = 0;
                stat(s, CPU_PARTIAL_DRAIN);
            }
        }

        pages++;
        pobjects += page->objects - page->inuse;

        page->pages = pages;
        page->pobjects = pobjects;
        page->next = oldpage;

    } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
                            != oldpage);
    if (unlikely(!s->cpu_partial)) {
        unsigned long flags;

        local_irq_save(flags);
        unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
        local_irq_restore(flags);
    }
    preempt_enable();
#endif
}

If no usable object can be found on any node, SLUB falls back to the page allocator and creates a brand-new slab.
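allocate_slab works in units of s->oo, which packs the page order and the number of objects per slab into a single word. Paraphrased from the same source (a sketch; exact types differ between versions), the decode helpers are essentially:

struct kmem_cache_order_objects {
    unsigned long x;
};

#define OO_SHIFT    16
#define OO_MASK     ((1 << OO_SHIFT) - 1)

static inline int oo_order(struct kmem_cache_order_objects x)
{
    return x.x >> OO_SHIFT;   /* page order of one slab */
}

static inline int oo_objects(struct kmem_cache_order_objects x)
{
    return x.x & OO_MASK;     /* objects that fit in such a slab */
}

allocate_slab itself: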

static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
    struct page *page;
    struct kmem_cache_order_objects oo = s->oo;
    gfp_t alloc_gfp;
    void *start, *p;
    int idx, order;

    flags &= gfp_allowed_mask;

    if (gfpflags_allow_blocking(flags))
        local_irq_enable();

    flags |= s->allocflags;

    /*
     * Let the initial higher-order allocation fail under memory pressure
     * so we fall-back to the minimum order allocation.
     */
    alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
    if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
        alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;

    page = alloc_slab_page(s, alloc_gfp, node, oo);
    if (unlikely(!page)) {
        oo = s->min;
        alloc_gfp = flags;
        /*
         * Allocation may have failed due to fragmentation.
         * Try a lower order alloc if possible
         */
        page = alloc_slab_page(s, alloc_gfp, node, oo);
        if (unlikely(!page))
            goto out;
        stat(s, ORDER_FALLBACK);
    }

    ...

    page->objects = oo_objects(oo);

    order = compound_order(page);
    page->slab_cache = s;
    __SetPageSlab(page);
    if (page_is_pfmemalloc(page))
        SetPageSlabPfmemalloc(page);

    start = page_address(page);

    if (unlikely(s->flags & SLAB_POISON))
        memset(start, POISON_INUSE, PAGE_SIZE << order);

    kasan_poison_slab(page);

    for_each_object_idx(p, idx, s, start, page->objects) {
        setup_object(s, page, p);
        if (likely(idx < page->objects))
            set_freepointer(s, p, p + s->size);
        else
            set_freepointer(s, p, NULL);
    }

    page->freelist = fixup_red_left(s, start);
    page->inuse = page->objects;
    page->frozen = 1;

out:
    if (gfpflags_allow_blocking(flags))
        local_irq_disable();
    if (!page)
        return NULL;

    mod_zone_page_state(page_zone(page),
        (s->flags & SLAB_RECLAIM_ACCOUNT) ?
        NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
        1 << oo_order(oo));

    inc_slabs_node(s, page_to_nid(page), page->objects);

    return page;
}

allocate_slab first allocates 2^order pages, freezes the new slab, and initializes every object in it by writing the address of the next free object at each object's free-pointer offset. Back in new_slab_objects, c->page is then set to the freshly allocated page and page->freelist is set to NULL to mark the slab as being used by a CPU.
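The "address of the next free object at the object's offset" bookkeeping is done by a pair of tiny helpers; in kernels of this era (before the freelist-pointer hardening was added) they are essentially:

static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
    return *(void **)(object + s->offset);
}

static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
    *(void **)(object + s->offset) = fp;
}

So walking a freelist is just pointer chasing through the free objects themselves; no metadata outside the slab is needed.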

That completes the walk through SLUB object allocation.
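To connect this back to the caller's view, here is a minimal, illustrative kernel-module-style use of the API this allocation path serves (the cache name and the foo struct are made up for the example; error handling is trimmed):

#include <linux/module.h>
#include <linux/slab.h>

struct foo {
    int id;
    char name[32];
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
    struct foo *f;

    /* object_size is sizeof(struct foo); SLUB adds alignment/metadata to get s->size */
    foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
                      0, SLAB_HWCACHE_ALIGN, NULL);
    if (!foo_cache)
        return -ENOMEM;

    /* Fast path: pops an object straight off c->freelist when one is available */
    f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
    if (f) {
        f->id = 1;
        kmem_cache_free(foo_cache, f);  /* returns the object to a freelist */
    }
    return 0;
}

static void __exit foo_exit(void)
{
    kmem_cache_destroy(foo_cache);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");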


 
