Linux内核分析3

周子轩 原创作品，转载请注明出处。《Linux内核分析》MOOC课程：http://mooc.study.163.com/course/USTC-1000029000

gdb跟踪start_kernel()函数

start_kernel()为系统内核的入口函数，该函数主要是对环境和组件等进行初始化操作。
课程中通过运行并跟踪MenuOS来了解内核从start_kernel()开始做了哪些事情，以下为我跟踪调试的截图。后面将分析该函数的执行过程。

分析`start_kernel()`函数

start_kernel(): /linux-3.18.6/init/main.c

/*
 * start_kernel() - kernel entry function (excerpt; most of the body is
 * elided and marked with "// ...").
 *
 * Only the initialization calls discussed in this article are kept, in
 * their original order: trap_init(), mm_init(), sched_init(), rest_init().
 */
asmlinkage __visible void __init start_kernel(void)
{
    // ...
    /* Installs the system_call gate, among others (see trap_init() excerpt). */
    trap_init();
    /* Memory-management initialization; not analyzed in this article. */
    mm_init();
    /*
     * Set up the scheduler prior starting any interrupts (such as the
     * timer interrupt). Full topology setup happens at smp_init()
     * time - but meanwhile we still have a functioning scheduler.
     */
    sched_init();
    // ...
    /* Do the rest non-__init'ed, we're now alive */
    rest_init();
}

这个函数的大部分代码被省略，只留下需要关注的4个初始化函数；其中mm_init()是内存管理初始化，暂时不分析，所以实际要分析的是3个。

`trap_init()`

中断向量表的初始化函数，设置了很多中断门(Interrupt Gate)，其中包括后面会关注到的system_call（通过set_system_trap_gate()设置）。
trap_init(): /linux-3.18.6/arch/x86/kernel/traps.c

/*
 * trap_init() - excerpt showing only the system-call gate setup; the rest
 * of the function is elided.
 *
 * When CONFIG_X86_32 is defined, system_call is installed as the handler
 * for SYSCALL_VECTOR via set_system_trap_gate(), and the SYSCALL_VECTOR bit
 * is set in used_vectors (presumably marking that vector as taken).
 */
void __init trap_init(void)
{
    // ...
#ifdef CONFIG_X86_32
    set_system_trap_gate(SYSCALL_VECTOR, &system_call);
    set_bit(SYSCALL_VECTOR, used_vectors);
#endif
    // ...
}

`sched_init()`

进程调度初始化函数，函数内做了很关键的一步初始化——对0号进程，即idle进程进行初始化

`rest_init()`

其他初始化函数，函数内将创建1号进程，即init进程。下面主要来分析该函数。

分析`rest_init()`函数

rest_init(): /linux-3.18.6/init/main.c

/*
 * rest_init() - last call made by start_kernel() in the excerpt above.
 *
 * Spawns the kernel_init thread first, then the kthreadd thread, stores the
 * kthreadd task pointer in kthreadd_task, signals kthreadd_done, runs the
 * scheduler once, and finally enters the idle loop through
 * cpu_startup_entry().
 */
static noinline void __init_refok rest_init(void)
{
    int pid;
    rcu_scheduler_starting();
    /*
     * We need to spawn init first so that it obtains pid 1, however
     * the init task will end up wanting to create kthreads, which, if
     * we schedule it before we create kthreadd, will OOPS.
     */
    /* The return value of this first kernel_thread() call is not kept. */
    kernel_thread(kernel_init, NULL, CLONE_FS);
    numa_default_policy();
    pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
    /* Look up the task for the kthreadd pid under the RCU read lock. */
    rcu_read_lock();
    kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
    rcu_read_unlock();
    /* NOTE(review): presumably releases a waiter on kthreadd_done; the waiter is not shown here. */
    complete(&kthreadd_done);
    /*
     * The boot idle thread must execute schedule()
     * at least once to get things moving:
     */
    init_idle_bootup_task(current);
    schedule_preempt_disabled();
    /* Call into cpu_idle with preempt disabled */
    cpu_startup_entry(CPUHP_ONLINE);
}

其中，创建了一个线程kernel_init，不过内核中并没有线程的概念，这里创建的是一个任务(进程)。如果把系统当成是一个大进程，那么init进程就相当于大进程中的一个线程，因为内核中的进程调度会为每一个像init这样的进程分配时间片来执行。

kernel_thread(kernel_init, NULL, CLONE_FS);

kernel_init(): /linux-3.18.6/init/main.c

/*
 * kernel_init() - thread function passed to the first kernel_thread() call
 * in rest_init().
 * @unused: not referenced in the body.
 *
 * After the remaining init work is done, tries to run an init program in
 * this order:
 *   1. ramdisk_execute_command, if it is set;
 *   2. execute_command, if it is set;
 *   3. the defaults /sbin/init, /etc/init, /bin/init and /bin/sh.
 * A failed attempt in step 1 or 2 is logged and the next step is tried.
 *
 * Returns 0 as soon as one attempt succeeds; calls panic() if none does.
 */
static int __ref kernel_init(void *unused)
{
    int ret;
    kernel_init_freeable();
    /* need to finish all async __init code before freeing the memory */
    async_synchronize_full();
    free_initmem();
    mark_rodata_ro();
    system_state = SYSTEM_RUNNING;
    numa_default_policy();
    flush_delayed_fput();
    /* Step 1: the ramdisk init command takes priority when set. */
    if (ramdisk_execute_command) {
        ret = run_init_process(ramdisk_execute_command);
        if (!ret)
            return 0;
        pr_err("Failed to execute %s (error %d)\n",
               ramdisk_execute_command, ret);
    }
    /*
     * We try each of these until one succeeds.
     *
     * The Bourne shell can be used instead of init if we are
     * trying to recover a really broken machine.
     */
    /* Step 2: explicitly configured command (see the "init=" hint in the panic text). */
    if (execute_command) {
        ret = run_init_process(execute_command);
        if (!ret)
            return 0;
        pr_err("Failed to execute %s (error %d).  Attempting defaults...\n",
            execute_command, ret);
    }
    /* Step 3: the || chain short-circuits, so it stops at the first path that returns 0. */
    if (!try_to_run_init_process("/sbin/init") ||
        !try_to_run_init_process("/etc/init") ||
        !try_to_run_init_process("/bin/init") ||
        !try_to_run_init_process("/bin/sh"))
        return 0;
    panic("No working init found.  Try passing init= option to kernel. "
          "See Linux Documentation/init.txt for guidance.");
}

代码中有3个顶层的if，分别对应三种不同的启动init的方式，但最终只会有1个init被启动：

如果ramdisk_execute_command被设置，执行-initrd指定的rootfs中的init
如果execute_command有值，执行命令行传入的init（猜测）
最后依次尝试文件系统中的/sbin/init、/etc/init、/bin/init、/bin/sh，第一个能成功执行的即作为1号进程启动

到此1号进程的启动分析完成
下面分析0号idle进程从哪里启动

/*
 * rest_init() - excerpt showing only its final call; everything before it
 * is elided. cpu_startup_entry() is where the idle loop is entered.
 */
static noinline void __init_refok rest_init(void)
{
    // ...
    /* Call into cpu_idle with preempt disabled */
    cpu_startup_entry(CPUHP_ONLINE);
}

在rest_init()函数的末尾，0号进程idle就是在这里启动的。
cpu_startup_entry(): /linux-3.18.6/kernel/sched/idle.c

/*
 * cpu_startup_entry() - enter the idle loop on the calling CPU.
 * @state: not referenced in the body shown here.
 *
 * On CONFIG_X86 builds the stack canary is (re)initialized first. Then
 * arch_cpu_idle_prepare() and cpu_idle_loop() are called. cpu_idle_loop()
 * is an endless loop, so this function does not return.
 */
void cpu_startup_entry(enum cpuhp_state state)
{
    /*
     * This #ifdef needs to die, but it's too late in the cycle to
     * make this generic (arm and sh have never invoked the canary
     * init for the non boot cpus!). Will be fixed in 3.11
     */
#ifdef CONFIG_X86
    /*
     * If we're the non-boot CPU, nothing set the stack canary up
     * for us. The boot CPU already has it initialized but no harm
     * in doing it again. This is a good place for updating it, as
     * we wont ever return from this function (so the invalid
     * canaries already on the stack wont ever trigger).
     */
    boot_init_stack_canary();
#endif
    arch_cpu_idle_prepare();
    cpu_idle_loop();
}
/*
 * cpu_idle_loop() - the idle loop itself; it never returns.
 *
 * Outer loop: set the polling flag, enter nohz idle, idle until
 * need_resched() becomes true, then leave nohz idle, clear the polling flag
 * and call schedule_preempt_disabled().
 *
 * Inner loop: each pass disables local interrupts and then idles, either by
 * polling (cpu_idle_poll()) or through cpuidle_idle_call().
 */
static void cpu_idle_loop(void)
{
    while (1) {
        /*
         * If the arch has a polling bit, we maintain an invariant:
         *
         * Our polling bit is clear if we're not scheduled (i.e. if
         * rq->curr != rq->idle).  This means that, if rq->idle has
         * the polling bit set, then setting need_resched is
         * guaranteed to cause the cpu to reschedule.
         */
        __current_set_polling();
        tick_nohz_idle_enter();
        /* Stay idle for as long as no reschedule has been requested. */
        while (!need_resched()) {
            check_pgt_cache();
            rmb();
            /* A CPU that has gone offline is handed to the arch "dead" hook. */
            if (cpu_is_offline(smp_processor_id()))
                arch_cpu_idle_dead();
            local_irq_disable();
            arch_cpu_idle_enter();
            /*
             * In poll mode we reenable interrupts and spin.
             *
             * Also if we detected in the wakeup from idle
             * path that the tick broadcast device expired
             * for us, we don't want to go deep idle as we
             * know that the IPI is going to arrive right
             * away
             */
            if (cpu_idle_force_poll || tick_check_broadcast_expired())
                cpu_idle_poll();
            else
                cpuidle_idle_call();
            arch_cpu_idle_exit();
        }
        /*
         * Since we fell out of the loop above, we know
         * TIF_NEED_RESCHED must be set, propagate it into
         * PREEMPT_NEED_RESCHED.
         *
         * This is required because for polling idle loops we will
         * not have had an IPI to fold the state for us.
         */
        preempt_set_need_resched();
        tick_nohz_idle_exit();
        __current_clr_polling();
        /*
         * We promise to call sched_ttwu_pending and reschedule
         * if need_resched is set while polling is set.  That
         * means that clearing polling needs to be visible
         * before doing these things.
         */
        smp_mb__after_atomic();
        sched_ttwu_pending();
        schedule_preempt_disabled();
    }
}