perf_event 源码分析

前言

简单来说，perf是一种性能监测工具。它首先对通用处理器提供的performance counter（性能计数器）进行编程，设定计数器的阈值和要监测的事件；此后每当设定的事件发生，性能计数器就会递增，直到计数值达到阈值。不同的体系结构提取计数器数值的方式不同：例如MIPS会注册一个硬件中断，计数器溢出时触发该中断，在中断处理函数中记录数值；x86则利用通知链机制，将溢出处理函数注册到die_chain通知链上，借助任何一个硬件中断发生的时机检测性能计数器是否溢出，若溢出则记录这个数值，这种实现方式避免了单独为性能计数器溢出注册一个硬件中断。

perf源码分为用户层和内核层，用户层代码为用户提供命令行指定事件与采样方式，perf的一大特点就体现在丰富的用户层工具，可以说，内核部分代码只是为perf提供采样引擎，用户层才是perf的精华。用户层代码位于src/tools/perf目录下，c代码有13000行左右，此外还有大量的脚本程序。内核层代码分为结构无关代码（位于src/kernel/core/目录），和结构相关代码（位于src/arch/x86/cpu/**）。

这里先列个框架：首先从系统启动初始化开始，介绍perf-init的相关工作；之后介绍用户层指定事件，通过系统调用转入内核，执行采样；采样数据通过内存映射返回给用户层，由用户层工具进行上层分析并显示。

perf_event源码分析(一)——cmd_record

perf's main entry

tools/perf/perf.c

/*
 * Table of perf's built-in subcommands: each entry pairs a subcommand name
 * string with its cmd_* handler function. The third field is 0 for every
 * entry shown here.
 * NOTE(review): the meaning of the third field is not visible in this
 * excerpt - presumably an option/flags word; confirm against the
 * definition of struct cmd_struct.
 */
static struct cmd_struct commands[] = {

    { "buildid-cache", cmd_buildid_cache, 0 },

    { "buildid-list", cmd_buildid_list, 0 },

    { "diff",   cmd_diff,   0 },

    { "evlist", cmd_evlist, 0 },

    { "help",   cmd_help,   0 },

    { "list",   cmd_list,   0 },

    { "record", cmd_record, 0 },

    { "report", cmd_report, 0 },

    { "bench",  cmd_bench,  0 },

    { "stat",   cmd_stat,   0 },

    { "timechart",  cmd_timechart,  0 },

    { "top",    cmd_top,    0 },

    { "annotate",   cmd_annotate,   0 },

    { "version",    cmd_version,    0 },

    { "script", cmd_script, 0 },

    { "sched",  cmd_sched,  0 },

/* "probe" is only part of the table when HAVE_LIBELF_SUPPORT is defined. */
#ifdef HAVE_LIBELF_SUPPORT

    { "probe",  cmd_probe,  0 },

#endif

    { "kmem",   cmd_kmem,   0 },

    { "lock",   cmd_lock,   0 },

    { "kvm",    cmd_kvm,    0 },

    { "test",   cmd_test,   0 },

/* "trace" is only part of the table when HAVE_LIBAUDIT_SUPPORT is defined. */
#ifdef HAVE_LIBAUDIT_SUPPORT

    { "trace",  cmd_trace,  0 },

#endif

    { "inject", cmd_inject, 0 },

    { "mem",    cmd_mem,    0 },

    { "data",   cmd_data,   0 },

};

perf record's CALL CHAIN:

cmd_record

	// create a struct "record" rec, and a struct perf_evlist in rec->evlist

	perf_evlist__new

	perf_config

	__cmd_record(&record, argc, argv); // fill out "struct record"

		perf_session__new(file, false, tool); // Create a new session for this rec (rec->session). Note: file is "struct perf_data_file *file", i.e. &rec->file

			machines__init(&session->machines);

			ordered_events__init(&session->ordered_events, ordered_events__deliver_event);

			perf_data_file__open(file)

				check_pipe(file)

				file->path = "perf.data" // If no name is specified, file->path defaults to "perf.data"

				open_file(file);

					fd = perf_data_file__is_read(file) ? open_file_read(file) : open_file_write(file);

					file->fd = fd;

			perf_session__create_kernel_maps(session) //

		fd = perf_data_file__fd(file); // Get rec's fd, rec->file->fd

		record__init_features(rec);

			perf_header__set_feat // Fill out session's header of this rec, rec->session->header

		record__open(rec)

			perf_evlist__config(evlist, opts); // perf_evlist

				perf_evsel__config(evsel, opts); // perf_evsel

		perf_header__clear_feat

		perf_header__write_pipe / perf_session__write_header

		perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, machine);

		perf_event__synthesize_modules(tool, process_synthesized_event, machine);

		machines__process_guests(&session->machines,perf_event__synthesize_guest_os, tool);

		__machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,process_synthesized_event, opts->sample_address);

tools/perf/builtin-record.c

/*
 * Entry point of the "record" subcommand (abridged excerpt).
 *
 * Visible steps: allocate the event list, load the perf config, parse the
 * command-line options, then reject two invalid invocations by printing
 * the usage text.
 *
 * @argc, @argv: command-line arguments handed to parse_options().
 * @prefix:      unused (marked __maybe_unused).
 *
 * Returns -ENOMEM when the event list cannot be allocated.
 * NOTE(review): the excerpt stops before the rest of the function - `err`
 * and `errbuf` are declared but not used here, and no final return is shown.
 */
int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)

{

    int err = -ENOMEM;

    /* Work on the `record` object defined outside this function. */
    struct record *rec = &record;

    char errbuf[BUFSIZ];

    rec->evlist = perf_evlist__new();

    if (rec->evlist == NULL)

        return -ENOMEM;

    perf_config(perf_record_config, rec);  // parse the config; see tools/perf/util/config.c

    /* argc is overwritten with the value returned by parse_options(). */
    argc = parse_options(argc, argv, record_options, record_usage,

                PARSE_OPT_STOP_AT_NON_OPTION);

    /*
     * Nothing left in argc and target__none() is true: print the usage.
     * NOTE(review): presumably this means "no workload command and no
     * pid/tid/cpu/uid target given" - confirm against target__none().
     */
    if (!argc && target__none(&rec->opts.target))

        usage_with_options(record_usage, record_options);

    /* cgroup monitoring is only accepted together with system-wide mode. */
    if (nr_cgroups && !rec->opts.target.system_wide) {

        ui__error("cgroup monitoring only available in"

              " system-wide mode\n");

        usage_with_options(record_usage, record_options);

    }

}

tools/perf/util/parse-events.c

setup_events // tools/perf/builtin-stat.c

	parse_events // tools/perf/util/parse-events.c

parse_events  // tools/perf/util/parse-events.c

/*
 * Parse the event description string @str and, on success, append the
 * parsed events to @evlist.
 *
 * @evlist: event list that receives the newly parsed entries.
 * @str:    event string handed to parse_events__scanner().
 *
 * Returns 0 on success; otherwise the non-zero value returned by
 * parse_events__scanner(), in which case @evlist is left unmodified here.
 */
int parse_events(struct perf_evlist *evlist, const char *str)

{

    /*
     * Temporary parse state: an empty list plus an index that starts at the
     * current number of entries in @evlist.
     */
    struct parse_events_evlist data = {

        .list = LIST_HEAD_INIT(data.list),

        .idx  = evlist->nr_entries,

    };

    int ret;

    ret = parse_events__scanner(str, &data, PE_START_EVENTS);

    /* Called whether or not the scan succeeded. */
    perf_pmu__parse_cleanup();

    if (!ret) {

        /* Entries added by this parse = how far data.idx advanced. */
        int entries = data.idx - evlist->nr_entries;

        perf_evlist__splice_list_tail(evlist, &data.list, entries);

        evlist->nr_groups += data.nr_groups;

        return 0;

    }

    /*

     * There are 2 users - builtin-record and builtin-test objects.

     * Both call perf_evlist__delete in case of error, so we don't

     * need to bother.

     */

    return ret;

}

struct introduction

tools/perf/util/target.h

/*
 * Describes what is being monitored. In the perf stat call chain later in
 * this document, pid/tid/uid are passed to thread_map__new_str() to build
 * evlist->threads and cpu_list is passed to cpu_map__new() to build
 * evlist->cpus.
 */
struct target {

    const char   *pid;         /* passed to thread_map__new_str() */

    const char   *tid;         /* passed to thread_map__new_str() */

    const char   *cpu_list;    /* passed to cpu_map__new() */

    const char   *uid_str;     /* NOTE(review): presumably the textual form of uid - confirm */

    uid_t        uid;          /* passed to thread_map__new_str() */

    bool         system_wide;  /* cmd_record requires this to be set for cgroup monitoring */

    bool         uses_mmap;

    bool         default_per_cpu;

    bool         per_thread;

};

===

tools/perf/util/data.h

/*
 * The data file a session reads from or writes to. The perf record call
 * chain in this document shows perf_data_file__open() filling it in.
 */
struct perf_data_file {

    const char      *path;   /* set to "perf.data" when no name was specified */

    int          fd;         /* result of open_file_read() or open_file_write() */

    bool             is_pipe; /* NOTE(review): presumably decided by check_pipe() - confirm */

    bool             force;

    unsigned long        size;

    enum perf_data_mode  mode; /* NOTE(review): presumably what perf_data_file__is_read() tests - confirm */

};

=== 

tools/perf/util/session.h

/*
 * State of one perf session, created by perf_session__new(file, false, tool)
 * in the perf record call chain earlier in this document.
 */
struct perf_session {

    struct perf_header  header;      /* feature bits are set through perf_header__set_feat() */

    struct machines     machines;    /* initialised by machines__init() */

    struct perf_evlist  *evlist;

    struct trace_event  tevent;

    bool            repipe;

    bool            one_mmap;

    void            *one_mmap_addr;

    u64         one_mmap_offset;

    struct ordered_events   ordered_events; /* initialised by ordered_events__init() with ordered_events__deliver_event */

    struct perf_data_file   *file;   /* NOTE(review): presumably the file passed to perf_session__new() - confirm */

    struct perf_tool    *tool;       /* NOTE(review): presumably the tool passed to perf_session__new() - confirm */

};

===

tools/perf/util/evlist.h 

/*
 * A list of event selectors (perf_evsel) together with the thread and cpu
 * maps they are opened on. parse_events() appends parsed events to it and
 * perf_evlist__create_maps() fills in threads/cpus.
 */
struct perf_evlist {

    struct list_head entries;   /* NOTE(review): presumably the evsel list walked by evlist__for_each() - confirm */

    struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];

    int      nr_entries;        /* parse_events() starts its index from this value */

    int      nr_groups;         /* parse_events() adds data.nr_groups to this */

    int      nr_mmaps;

    size_t       mmap_len;

    int      id_pos;

    int      is_pos;

    u64      combined_sample_type;

    struct {

        int cork_fd;

        pid_t   pid;

    } workload;

    bool         overwrite;

    struct fdarray   pollfd;

    struct perf_mmap *mmap;

    struct thread_map *threads; // set from thread_map__new_str(target->pid, target->tid, target->uid)

    struct cpu_map    *cpus;   // set from cpu_map__dummy_new() or cpu_map__new(target->cpu_list)

    struct perf_evsel *selected;

    struct events_stats stats;

};

=== 

/** struct perf_evsel - event selector **/

Each event passed in by the user maps to one perf_evsel struct.

/*
 * Event selector: one instance per event requested by the user.
 * This is an abridged excerpt - the "..." lines stand for fields the
 * author left out.
 */
struct perf_evsel {

    struct list_head    node;

    struct perf_event_attr  attr;   /* &evsel->attr is what gets passed to sys_perf_event_open() */

    char            *filter;        /* passed to perf_evsel__set_filter() */

    struct xyarray      *fd;        /* xyarray__new(ncpus, nthreads, sizeof(int)): one int per (cpu, thread) */

    struct xyarray      *sample_id;

    u64         *id;

    struct perf_counts  *counts;    /* allocated by perf_evsel__alloc_counts() */

    struct perf_counts  *prev_raw_counts; /* allocated by perf_evsel__alloc_prev_raw_counts() */

    int         idx;

    u32         ids;

    char            *name;

    double          scale;

    const char      *unit;

    bool            snapshot;

    struct event_format *tp_format;

    ...

    ...

    struct perf_evsel   *leader;

}

=== 

tools/perf/builtin-record.c

/*
 * All state of one "perf record" run. cmd_record() works on an instance of
 * this struct through the pointer `rec`.
 */
struct record {

    struct perf_tool    tool;

    struct record_opts  opts;       /* cmd_record() inspects opts.target */

    u64         bytes_written;

    struct perf_data_file   file;   /* &rec->file is the file handed to perf_session__new() */

    struct perf_evlist  *evlist;    /* allocated by perf_evlist__new() in cmd_record() */

    struct perf_session *session;   /* the session created by perf_session__new() */

    const char      *progname;

    int         realtime_prio;

    bool            no_buildid;

    bool            no_buildid_cache;

    long            samples;

};

===

Important: struct perf_stat contains an array of three "struct stats" (res_stats[3]),

which is initialized as follows:

    for (i = 0; i < 3; i++)

        init_stats(&ps->res_stats[i]);

/*
 * Per-event statistics: three struct stats slots, each of which is reset
 * with init_stats() in the loop shown just above.
 */
struct perf_stat {

    struct stats      res_stats[3];

};

tools/perf/util/stat.h

/*
 * Accumulator for a series of samples.
 * NOTE(review): n / mean / M2 look like the running count, mean and sum of
 * squared deviations of an online variance computation - confirm against
 * update_stats().
 */
struct stats

{

    double n, mean, M2;

    u64 max, min;

};

====

tools/perf/util/evsel.h

/*
 * One counter reading. The anonymous struct and the array share storage, so
 * values[0], values[1] and values[2] alias val, ena and run respectively.
 * In the perf stat call chain later in this document, an object of this type
 * is filled by readn() from the event fd.
 */
struct perf_counts_values {

    union {

        struct {

            u64 val;

            u64 ena;

            u64 run;

        };

        u64 values[3];

    };

};

/*
 * Counter readings attached to an evsel (evsel->counts).
 */
struct perf_counts {

    s8            scaled;

    struct perf_counts_values aggr;   /* zeroed by read_counter_aggr() before the counter is read */

    /*
     * Flexible array member.
     * NOTE(review): presumably one element per cpu, sized by the ncpus
     * argument of perf_evsel__alloc_counts() - confirm.
     */
    struct perf_counts_values cpu[];

};

perf stat's CALL CHAIN

CALL CHAIN:

commands // tools/perf/perf.c

	cmd_stat // tools/perf/builtin-stat.c

		parse_events_option // For `perf stat -e xxx`, the specified event name is checked here

			parse_events

 				parse_events__scanner // check events

 					parse_events_lex_init_extra

 					parse_events__scan_string

 					parse_events_parse

 					parse_events__flush_buffer

 					parse_events__delete_buffer

 					parse_events_lex_destroy

				perf_pmu__parse_cleanup

		perf_evlist__new();

			perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, struct thread_map *threads) // evlist->cpus, evlist->threads

				perf_evlist__set_maps ///

		parse_options

		parse_options_usage

		add_default_attributes()

		target__validate(&target);

		perf_evlist__create_maps(evsel_list, &target) // fill out evlist->threads(thread_map)

			evlist->threads = thread_map__new_str(target->pid, target->tid,target->uid); // evlist->threads

			evlist->threads(thread_map) = [tid,tid,tid,tid,...]

			target__uses_dummy_map(target)

				evlist->cpus = cpu_map__dummy_new() // evlist->cpus

				evlist->cpus = cpu_map__new(target->cpu_list)

		perf_evlist__alloc_stats(evsel_list, interval)  // Traverse all evsel

			evlist__for_each(evlist, evsel) {

				perf_evsel__alloc_stat_priv(evsel) // Alloc memory for each evsel->priv = zalloc(sizeof(struct perf_stat));

					perf_evsel__reset_stat_priv(evsel)

						init_stats // Fill out "struct perf_stat", perf_stat include 3 elements of "struct stats{}"

				perf_evsel__alloc_counts(evsel, perf_evsel__nr_cpus(evsel)) //  Alloc evsel->counts

				alloc_raw && perf_evsel__alloc_prev_raw_counts(evsel) // Alloc evsel->prev_raw_counts =  addr;

			}

		perf_stat_init_aggr_mode()

			cpu_map__build_socket_map

				cpu_map__build_map(cpus, sockp, cpu_map__get_socket);

				cpu_map__get_socket

			cpu_map__build_core_map

				cpu_map__build_map(cpus, corep, cpu_map__get_core);

				cpu_map__get_core

					cpu_map__get_socket

		run_perf_stat(argc, argv);

			__run_perf_stat(argc, argv);

				perf_evlist__prepare_workload(evsel_list, &target, argv, false, workload_exec_failed_signal)

				perf_evlist__set_leader(evsel_list); // evlist->nr_groups = 1 if evlist->nr_entries > 1, otherwise 0

					__perf_evlist__set_leader(&evlist->entries);

					evlist__for_each(evsel_list, evsel) {  // Traverse all evsel

						create_perf_stat_counter(evsel)

							struct perf_event_attr *attr = &evsel->attr;

							attr->xxx  = xxx

							perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel))

							perf_evsel__is_group_leader(evsel)

							perf_evsel__open_per_thread(evsel, evsel_list->threads)

								// important: __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, struct thread_map *threads)

								__perf_evsel__open(evsel, &empty_cpu_map.map, threads)

									// perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads), if system_wide: nthreads = 1

									perf_evsel__alloc_fd(evsel, cpus->nr, nthreads)

										evsel->fd = xyarray__new(ncpus, nthreads, sizeof(int));

									for (cpu = 0; cpu < cpus->nr; cpu++) {

										 for (thread = 0; thread < nthreads; thread++) {

										 	group_fd = get_group_fd(evsel, cpu, thread);

										 	sys_perf_event_open(&evsel->attr, pid, cpus->map[cpu], group_fd, flags);

										 }

									}

					}

					perf_evlist__apply_filters(evsel_list, &counter)

					evlist__for_each(evlist, evsel) {

						perf_evsel__set_filter(evsel, ncpus, nthreads, evsel->filter);

					}

					t0 = rdclock();

					clock_gettime(CLOCK_MONOTONIC, &ref_time);

					if (forks) {

						perf_evlist__start_workload(evsel_list);

						handle_initial_delay();

						if (interval) {

							print_interval();

						}

					} else {

						handle_initial_delay();

						print_interval();

					}

					t1 = rdclock();

					update_stats(&walltime_nsecs_stats, t1 - t0);

					// 开始为每个evsel读

					if (aggr_mode == AGGR_GLOBAL) {

						evlist__for_each(evsel_list, counter) {

							// 读到struct: "struct perf_counts_values", 保存在evsel的 &counter->counts->aggr , （这里evsel 就是counter）

							// 还有“struct perf_stat” ， counter->priv

							read_counter_aggr(counter);

								aggr->val = aggr->ena = aggr->run = 0; // 这里， 把 perf_counts_values aggr 全部初始化为0

								read_counter(counter)  // 如何读此event？遍历每个thread和cpu

									int nthreads = thread_map__nr(evsel_list->threads);

									int ncpus = perf_evsel__nr_cpus(counter);

									int cpu, thread;

									for (thread = 0; thread < nthreads; thread++) {

										for (cpu = 0; cpu < ncpus; cpu++) {

											// process + cpu 二维数组方式读, 读到 "struct  perf_counts_values count"

											process_per_cpu(struct perf_evsel *evsel, int cpu, int thread)

												perf_evsel__read_cb(evsel, cpu, thread, &count)

													memset(count, 0, sizeof(*count));

													FD(evsel, cpu, thread)

													readn(FD(evsel, cpu, thread), count, sizeof(*count))

														ion(true, fd, buf, n);

															read(fd, buf, left)

												read_cb(evsel, cpu, thread, tmp);

													switch (aggr_mode) {

														case AGGR_CORE:

														case AGGR_SOCKET:

														case AGGR_NONE:

														perf_evsel__compute_deltas(evsel, cpu, count);

														perf_counts_values__scale(count, scale, NULL);

														update_shadow_stats(evsel, count->values, cpu);

													}

										}

									}

							perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), thread_map__nr(evsel_list->threads));

						}

					} else {

						evlist__for_each(evsel_list, counter) {

							read_counter(counter);

							perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1);

						}

					}

		print_stat

			print_aggr // AGGR_CORE AGGR_SOCKET

			print_counter_aggr(evsel, NULL); // AGGR_GLOBAL

			print_counter(evsel, NULL) // AGGR_NONE

tools/perf/util/evsel.h

struct perf_evsel {

}

perf-perf stat用户层代码分析的更多相关文章

Express实例代码分析1——简单的用户验证登录文件
/** * Module dependencies. */ var express = require('../..');// ../..是上级目录的上级目录 var hash = require(' ...
完整全面的Java资源库（包括构建、操作、代码分析、编译器、数据库、社区等等）
构建这里搜集了用来构建应用程序的工具. Apache Maven:Maven使用声明进行构建并进行依赖管理,偏向于使用约定而不是配置进行构建.Maven优于Apache Ant.后者采用了一种过程化 ...
wifi display代码分析
转自:http://blog.csdn.net/lilian0118/article/details/23168531 这一章中我们来看Wifi Display连接过程的建立,包含P2P的部分和RTS ...
Linux从用户层到内核层系列 - GNU系列之glibc介绍
题记:本系列文章的目的是抛开书本从源代码和使用的角度分析Linux内核和相关源代码,byhankswang和你一起玩转linux开发轻松搞定TCP/IP协议栈,原创文章欢迎交流, byhankswa ...
虚拟机创建流程中neutron代码分析（三）
前言: 当neutron-server创建了port信息,将port信息写入数据库中.流程返回到nova服务端,接着nova创建的流程继续走.在计算节点中neutron-agent同样要完成很多的工作 ...
Android Hal层简要分析
Android Hal层简要分析 Android Hal层(即 Hardware Abstraction Layer)是Google开发的Android系统里上层应用对底层硬件操作屏蔽的一个软件层次, ...
【转载】word2vec原理推导与代码分析
本文的理论部分大量参考<word2vec中的数学原理详解>,按照我这种初学者方便理解的顺序重新编排.重新叙述.题图来自siegfang的博客.我提出的Java方案基于kojisekig,我 ...
OVS 内核KEY值提取及匹配流表代码分析
原文链接:http://ry0117.com/2016/12/24/OVS内核KEY值提取及匹配流表代码分析/ 当开启OVS后,创建datapath类型为system的网桥并他添加相关接口,OVS网桥 ...
Https与Http，SSL,DevOps, 静态代码分析工具，RFID, SSH, 非对称加密算法(使用最广泛的一种是RSA)，数字签名，数字证书
在URL前加https://前缀表明是用SSL加密的. 你的电脑与服务器之间收发的信息传输将更加安全. Web服务器启用SSL需要获得一个服务器证书并将该证书与要使用SSL的服务器绑定. http和h ...

随机推荐

怎样一步步用D3画多曲线
Bar Chart: http://bl.ocks.org/mbostock/3885304 这是一个画柱状图的基本形式. Axis是数轴: tickets是数轴上的标尺.tickets第二个參数% ...
apple air装双系统（win7）
同事买了一个apple air.用不习惯,希望再装个win7,经过多次试验,得到例如以下操作方法: 1.在MAC系统里的"有用工具"中找到"Boot Camp 助理 ...
Python中range和xrange的异同之处
range 函数说明:range([start,] stop[, step]).依据start与stop指定的范围以及step设定的步长,生成一个序列. range演示样例: >> ...
tomcat的localhost_access_log日志文件
一.服务器打印日志要关闭hibernate的日志,首先要把hibernate.show_sql设置为false;然后设置log4j.properties. # Control logging for ...
HDFS02
读取流程写流程 ============SecondaryNameNode============ Namenode的一个快照周期性的备份namenode 记录namenode中的metadata ...
java Map 转 List
public static void testMapVoid () { Map map = new HashMap(); map.put("a", "a1"); ...
anaconda安装python三方包，以tensorflow为例
Anaconda概述 Anaconda是一个用于科学计算的Python发行版,支持 Linux, Mac, Windows系统,提供了包管理与环境管理的功能,可以很方便地解决多版本python并存.切 ...
ngRoute (angular-route.js) 和 ui-router (angular-ui-router.js) 模块有什么不同呢？
ngRoute (angular-route.js) 和 ui-router (angular-ui-router.js) 模块有什么不同呢? 很多文章中都有说道:当时ngRoute在路由配置时用$r ...
44. Ext信息提示对话框
转自:https://www.cnblogs.com/glsqh/p/5920500.html Ext.window.MessageBox是一个工具类,他继承自Ext.window.Windoe对象, ...
常用进制的转换、进制数的and与or或xor异或运算
[十进制转换成其他进制]例:将25转换为二进制数解: 25÷2=12 余数1 12÷2=6 余数0 6÷2=3 余数0 3÷2=1 余数1 1÷2=0 余数1 所 ...

perf-perf stat用户层代码分析