调试没有core文件的coredump

　　对coredump的分析依赖于core文件，因为core文件中几乎包含了程序当前的所有状态（堆栈、内存、寄存器等）。然而在实际的线上环境中，由于core文件太大、保存core文件耗时太久，出于线上系统的稳定性与快速恢复考虑，我们往往不会保留core文件。同时，在程序堆栈被破坏的情况下，即使我们保留了core文件，也无法准确获取程序崩溃时的上下文信息。

　　那么，在不保留core文件的情况下，如何获取程序崩溃时的上下文信息（主要是函数调用栈）？

1.coredump原理

当程序发生内存越界访问等行为时，会触发OS的保护机制，此时OS会产生一个信号(signal)发送给对应的进程。当进程从内核态切换到用户态时，该进程会处理这个信号。此类信号（比如SEGV）的默认处理行为是生成一个coredump文件。

这里会涉及以下几个问题：

1. 保存的core文件在什么地方？

2. core文件，具体会把进程地址空间的哪些内容保存下来？

3. 如何控制core文件的大小？

4. 如果在处理信号的时候，又产生了新的同类信号，该如何处理？

5. 处理信号的代码，是运行在用户态还是内核态？

6. 在一个多线程的程序中，是由哪个线程在处理这个信号？

`/proc/sys/kernel/core_uses_pid` 取值是0或者1，表示是否在core文件名字后面加上进程号

`/proc/$pid/coredump_filter` 设置哪些内存会被dump出来

- (bit 0) anonymous private memory
- (bit 1) anonymous shared memory
- (bit 2) file-backed private memory
- (bit 3) file-backed shared memory
- (bit 4) ELF header pages in file-backed private memory areas (it is effective only if the bit 2 is cleared)
- (bit 5) hugetlb private memory
- (bit 6) hugetlb shared memory
- (bit 7) DAX private memory
- (bit 8) DAX shared memory

`ulimit -c` 决定保存的core文件的大小限制

2.自定义信号处理函数

　　需要在自定义的信号处理函数中打印出程序崩溃时候的活跃函数堆栈信息。

这里我们有两种方式：

1.使用backtrace等方法，读取进程堆栈上的信息；

2.在函数调用的同时，用户自己维护一套数据结构，用于保存函数调用链，在信号处理函数中，将这个函数调用链打印出来

eg：

/**/

int bugreportsignal(const int sig)

{

    struct sigaction action;

    memset(&action, 0, sizeof(action));

    action.sa_sigaction = signal_core_bugreport;

    action.sa_flags = SA_SIGINFO;

    return (-1 != sigaction(sig, &action, NULL));

}

/*
 * Counter consulted by signal_core_bugreport() before it writes a crash
 * report. The type is C11 atomic_int from <stdatomic.h>; the published
 * listing misspelled it as "atmic_int".
 */
static atomic_int  first = 0;

/*
 * Minimal signal handler: reports the number of the received signal on
 * stderr and returns.
 *
 * sig: number of the signal being delivered.
 */
static void signal_func_def(int sig)
{
    (void)fprintf(stderr, "recv signal %d \n", sig);
}

/*
 * SA_SIGINFO handler shared by all signals hooked for bug reporting.
 *
 * sig:  number of the signal being delivered.
 * info: siginfo_t passed in by the kernel; forwarded to write_stack_msg().
 * ptr:  signal context (ucontext_t cast to void *); forwarded to
 *       write_stack_msg().
 *
 * Behaviour per signal group:
 *  - SIGSEGV/SIGABRT/SIGFPE/SIGILL/SIGBUS: write a report, restore the
 *    default action and send the signal to ourselves again.
 *  - SIGTERM/SIGINT/SIGQUIT: hand over to bugreport_def_return().
 *  - SIGUSR1/SIGUSR2: write a report and return.
 *  - anything else: ignored.
 */
inline static void signal_core_bugreport(const int sig, siginfo_t * info, void * ptr)
{
    switch (sig)
    {
    case SIGSEGV:
    case SIGABRT:
    case SIGFPE:
    case SIGILL:
    case SIGBUS:
        /*
         * While the report is being written, a repeated signal of the same
         * kind is only logged by signal_func_def() (the listing named an
         * undefined "signal_func" here).
         */
        signal(sig, signal_func_def);

        /*
         * NOTE(review): atomic_add() is not defined in this listing; the
         * "== 1" test presumably relies on it returning the incremented
         * value so that only the first fatal signal writes a report - confirm.
         */
        if (atomic_add(first, 1) == 1)
        {
            write_stack_msg(sig, info, ptr,"txt");
        }

        /* Back to the default action and re-send, so a core file is still generated. */
        signal(sig, SIG_DFL);
        kill(getpid(),sig);
        break;

    case SIGTERM:
    case SIGINT:
    case SIGQUIT:
        bugreport_def_return(sig);   /* the ';' was missing in the listing */
        break;

    case SIGUSR1:
        write_stack_msg(sig, info, ptr,"sigusr1");
        break;

    case SIGUSR2:
        write_stack_msg(sig, info, ptr,"sigusr2");
        break;

    default:
        break;
    }
}

/*
 * Exit path used by signal_core_bugreport() for SIGTERM, SIGINT and SIGQUIT.
 *
 * sig: the signal that triggered the exit (only used for the log line).
 *
 * Logs the pid and signal number to stderr, restores the default SIGTERM
 * action and terminates the process with exit status 0. Does not return.
 */
static inline void bugreport_def_return(const int sig)
{
    /* stderr is a FILE *, so fprintf() is required; snprintf() expects a char buffer. */
    fprintf(stderr, " normally exit , pid:%d, sig:%d\n", (int)getpid(), sig);

    /*
     * The listing guarded this with "SIGTERM == sig || true", which is
     * always true, so the reset is simply unconditional.
     */
    signal(SIGTERM, SIG_DFL);

    exit(0);
}

/*
 * Append the "ps -aux" lines that mention bugreport_process to the report.
 *
 * fd: open file descriptor of the report file (a raw descriptor, not a
 *     FILE *), so all output goes through write(2).
 *
 * Best effort: returns without writing anything when the command line does
 * not fit into the buffer or popen() fails.
 */
static inline void bugreport_save_task(int fd)
{
#define TASK_CMD  "ps -aux | grep %s | grep -v grep"
    static const char header[] = "ps -aux res:\n";
    char tmp_buf[1024] = {0};
    FILE *stream;
    int len;

    len = snprintf(tmp_buf, sizeof(tmp_buf), TASK_CMD, bugreport_process);
    if (len < 0 || (size_t)len >= sizeof(tmp_buf)) {
        /* Never hand a truncated command line to the shell. */
        return;
    }

    stream = popen(tmp_buf, "r");
    if (stream == NULL) {
        /* popen() signals failure with NULL; a pointer cannot be tested with "< 0". */
        return;
    }

    (void)write(fd, header, sizeof(header) - 1);

    /* tmp_buf is reused: the command string is no longer needed once popen() returned. */
    while (fgets(tmp_buf, sizeof(tmp_buf), stream)) {
        (void)write(fd, tmp_buf, strlen(tmp_buf));
    }

    pclose(stream);
}

/*
 * Write a crash/diagnostic report for the calling thread into a log file
 * under bugreport_logpath.
 *
 * sig:            signal number being handled.
 * info:           siginfo_t of the signal; si_errno, si_code, si_pid and
 *                 si_addr are printed.
 * ptr:            signal context (ucontext_t cast to void *); the register
 *                 dump and the stack walk start from its uc_mcontext.
 * logfile_suffix: not used by this listing.
 *
 * The report contains, in order: a header (time, signal, pid, tid), the
 * siginfo fields, a register dump, a frame-pointer based stack trace, and
 * whatever save_stacktrace(), save_proc("maps"), save_proc("status"),
 * bugreport_save_task() and bugreport_save_svninfo() append.
 *
 * Always returns 0, including when the log file cannot be opened.
 *
 * NOTE(review): the published listing is partly pseudo-code. The "xxxx"
 * size/format placeholders and the buffers logpath, logfile, stackinfo and
 * errmsg are not defined in this block; they are kept exactly as published
 * and must be filled in before this compiles.
 */
static inline int write_stack_msg(const int sig, siginfo_t * info,
    void * ptr,const char* logfile_suffix)
{
    /* NOTE(review): these names are the SIGSEGV si_code values; the table is used for every signal. */
    static const char * si_codes[3] = {"", "SEGV_MAPERR", "SEGV_ACCERR"};
    size_t i = 0;
    ucontext_t * ucontext = (ucontext_t *)ptr;
    unsigned long stack_start = 0;
    unsigned long stack_end = 0;

    (void)logfile_suffix;

    snprintf(logpath, "xxxxxxxxxxxxxxxx", bugreport_logpath);

    /*
     * Ask the helper for the stack range of this thread. The listing passed
     * the literal "/proc/getpid()/maps", which is not an existing path;
     * /proc/self/maps is the memory map of the calling process.
     */
    foreach_stack_rang(gettid(), "/proc/self/maps",&stack_start, &stack_end);

    umask(0);

    /* Best effort: a failing mkdir() (e.g. directory already there) is ignored; open() below catches real problems. */
    (void)mkdir(bugreport_logpath, 0755);

    snprintf(logfile, xxxx, "%s/%s_time().txt", bugreport_logpath, bugreport_process_name);

    int fd = open(logfile, O_RDWR | O_CREAT | O_APPEND, 0777);
    if (-1 == fd)
    {
        /* printf() does not take a stream argument; fprintf() does. */
        fprintf(stderr,"%s\n", errmsg);
        return 0;
    }

    time_t now;
    now = time(0);

    /* Header line; time_t and the thread id are cast to match the %ld conversions. */
    snprintf(stackinfo,""xxxxx""
             "time:%ld sig:%d{%s} pid:%d-xxxx--tpid:%ld\n-----xxx-----\n",
             (long)now,
             sig,
             bugreport_signals[sig],
             getpid(),
             (long)gettid());
    write(fd, stackinfo, strlen(stackinfo));

    int f = 0;
    Dl_info dl_info;
    void ** bp = 0;
    void * ip = 0;

    /* Only si_code values 0..2 have an entry in si_codes[]. */
    if (info->si_code >= 0 && info->si_code < 3) {
        snprintf(stackinfo, xxxxxx, "Segmentation Fault!\n"
             "info.si_signo = %d\n"
             "info.si_errno = %d\n"
             "info.si_code = %d (%s)\n"
             "info.si_pid = %d\n"
             "info.si_addr = %p\n",
             sig,
             info->si_errno,
             info->si_code,
             si_codes[info->si_code],
             info->si_pid,
             info->si_addr
            );
    } else {
        snprintf(stackinfo,xxxxx, "Segmentation Fault!\n"
             "info.si_signo = %d\n"
             "info.si_errno = %d\n"
             "info.si_code = %d\n"
             "info.si_pid = %d\n"
             "info.si_addr = %p\n",
             sig,
             info->si_errno,
             info->si_code,
             info->si_pid,
             info->si_addr
            );
    }
    write(fd, stackinfo, strlen(stackinfo));

    /* Starting point of the stack walk: pc and fp saved in the signal context (ARM field names). */
    ip = (void *)ucontext->uc_mcontext.arm_pc;
    bp = (void **)ucontext->uc_mcontext.arm_fp;

    write(fd, "REG:\n", strlen("REG:\n"));

    /*
     * Dump uc_mcontext as an array of unsigned long, four values per line.
     * fd is a file descriptor, so dprintf() is used instead of fprintf().
     */
    for (i = 0; i < sizeof(ucontext->uc_mcontext)/sizeof(unsigned long); i++) {
        dprintf(fd, "\t%s: 0x%08lx", rname_index[i],
                ((unsigned long*)&ucontext->uc_mcontext)[i]);
        if (i % 4 == 3)
            dprintf(fd, "\n");
    }

    /* The length is taken from the same literal that is written (the listing measured a shorter one). */
    write(fd, "\nStack trace:\n\n", strlen("\nStack trace:\n\n"));

    while (bp && ip)
    {
        if (!dladdr(ip, &dl_info))
        {
            break;
        }

        /* dladdr() may leave dli_sname NULL when no symbol matches; never pass NULL to %s. */
        const char * symname = dl_info.dli_sname ? dl_info.dli_sname : "(null)";

        dprintf(fd, "stack #%02d: bp:%p %s [%p->%p] <%s+%ld>\n",
                 ++f,bp,
                 dl_info.dli_fname ? dl_info.dli_fname : "(null)",
                 ip,
                 (void*)((intptr_t)ip - (intptr_t)dl_info.dli_fbase),
                 symname,
                 (long)((intptr_t)ip - (intptr_t)dl_info.dli_saddr)
                );

        /* Stop before dereferencing a frame pointer outside the thread's stack range. */
        if( !((unsigned long)bp > stack_start && (unsigned long)bp < stack_end) )
        {
            break;
        }

        /* Next frame: bp[2] is taken as the code address, bp[0] as the next frame pointer. */
        ip = bp[2];
        bp = (void **)bp[0];
    }

    write(fd, "End of stack trace\n", strlen("End of stack trace\n"));

    save_stacktrace(fd, ucontext->uc_mcontext.arm_sp);
    save_proc(fd,"maps");
    save_proc(fd,"status");
    bugreport_save_task(fd);
    bugreport_save_svninfo(fd);

    close(fd);
    return 0;
}

/*
 * Hook every signal the bug-report machinery cares about.
 *
 * bugreport_signal_cb() is applied to SIGSEGV, SIGABRT, SIGFPE, SIGINT,
 * SIGBUS, SIGILL, SIGQUIT and SIGTERM; bugreport_signal_inore() is applied
 * to SIGHUP and SIGPIPE.
 *
 * Always returns 0; the results of the individual calls are not checked.
 *
 * NOTE(review): neither bugreport_signal_cb nor bugreport_signal_inore is
 * defined in this listing ("inore" looks like a typo of "ignore", and the
 * "()(sig)" call form is unusual) - confirm against their definitions.
 */
int signal_bugreport_setup()
{
    bugreport_signal_cb(SIGSEGV);
    bugreport_signal_cb(SIGABRT);
    bugreport_signal_cb(SIGFPE);    /* the listing ended this line with a full-width semicolon */
    bugreport_signal_cb(SIGINT);
    bugreport_signal_cb(SIGBUS);
    bugreport_signal_cb(SIGILL);
    bugreport_signal_cb(SIGQUIT);
    bugreport_signal_cb(SIGTERM);

    bugreport_signal_inore()(SIGHUP);
    bugreport_signal_inore()(SIGPIPE);
    //bugreport_signal_inore()(SIGCHLD);// ignoring SIGCHLD breaks the return value of system(), so it must not be ignored

    return 0;
}

/*
 * Announce a normal, signal-driven exit on stdout and terminate the process
 * with exit status 0. Does not return.
 *
 * sig: the signal that requested the exit; for SIGTERM the default action
 *      is restored before exiting.
 */
static inline void bugreport_def_term(const int sig)
{
    char msg[256];

    snprintf(msg, 255, "Exit Normally, pid:%d, sig:%d\n", getpid(), sig);
    fputs(msg, stdout);

    if (sig == SIGTERM)
        signal(SIGTERM, SIG_DFL);

    exit(0);
}

// Print the current call stack to stdout, one line per frame, then end the
// process with exit(0).
//
// Signal: signal number, echoed in the summary line.
void dump_trace(int Signal)
{
    const int max_frames = 200;
    void* frames[max_frames];

    printf("dump_trace\n");
    const int depth = ::backtrace(frames, max_frames);

    printf("backtrace\n");
    char** symbols = ::backtrace_symbols(frames, depth);

    printf("sig:%d nptrs:%d\n", Signal, depth);

    if (symbols != NULL) {
        int idx = 0;
        while (idx < depth) {
            printf("frame=%d||trace_back=%s||\n", idx, symbols[idx]);
            ++idx;
        }
        // backtrace_symbols() returns one malloc'ed array; a single free() releases it.
        free(symbols);
    }

    exit(0);
}

https://www.man7.org/linux/man-pages/man2/sigaction.2.html

 The siginfo_t argument to a SA_SIGINFO handler

       When the SA_SIGINFO flag is specified in act.sa_flags, the signal

       handler address is passed via the act.sa_sigaction field.  This han‐

       dler takes three arguments, as follows:

           void

           handler(int sig, siginfo_t *info, void *ucontext)

           {

               ...

           }

       These three arguments are as follows

       sig    The number of the signal that caused invocation of the han‐

              dler.

       info   A pointer to a siginfo_t, which is a structure containing fur‐

              ther information about the signal, as described below.

       ucontext

              This is a pointer to a ucontext_t structure, cast to void *.

              The structure pointed to by this field contains signal context

              information that was saved on the user-space stack by the ker‐

              nel; for details, see sigreturn(2).  Further information about

              the ucontext_t structure can be found in getcontext(3).  Com‐

              monly, the handler function doesn't make any use of the third

              argument.

       The siginfo_t data type is a structure with the following fields:

           siginfo_t {

               int      si_signo;     /* Signal number */

               int      si_errno;     /* An errno value */

               int      si_code;      /* Signal code */

               int      si_trapno;    /* Trap number that caused

                                         hardware-generated signal

                                         (unused on most architectures) */

               pid_t    si_pid;       /* Sending process ID */

               uid_t    si_uid;       /* Real user ID of sending process */

               int      si_status;    /* Exit value or signal */

               clock_t  si_utime;     /* User time consumed */

               clock_t  si_stime;     /* System time consumed */

               sigval_t si_value;     /* Signal value */

               int      si_int;       /* POSIX.1b signal */

               void    *si_ptr;       /* POSIX.1b signal */

               int      si_overrun;   /* Timer overrun count;

                                         POSIX.1b timers */

               int      si_timerid;   /* Timer ID; POSIX.1b timers */

               void    *si_addr;      /* Memory location which caused fault */

               long     si_band;      /* Band event (was int in

                                         glibc 2.3.2 and earlier) */

               int      si_fd;        /* File descriptor */

               short    si_addr_lsb;  /* Least significant bit of address

                                         (since Linux 2.6.32) */

               void    *si_lower;     /* Lower bound when address violation

                                         occurred (since Linux 3.19) */

               void    *si_upper;     /* Upper bound when address violation

                                         occurred (since Linux 3.19) */

               int      si_pkey;      /* Protection key on PTE that caused

                                         fault (since Linux 4.6) */

               void    *si_call_addr; /* Address of system call instruction

                                         (since Linux 3.5) */

               int      si_syscall;   /* Number of attempted system call

                                         (since Linux 3.5) */

               unsigned int si_arch;  /* Architecture of attempted system call

                                         (since Linux 3.5) */

           }

       si_signo, si_errno and si_code are defined for all signals.

       (si_errno is generally unused on Linux.)  The rest of the struct may

       be a union, so that one should read only the fields that are meaning‐

       ful for the given signal:

#include <execinfo.h>

int backtrace(void **buffer, int size);

char **backtrace_symbols(void *const *buffer, int size);

void backtrace_symbols_fd(void *const *buffer, int size, int fd);

backtrace函数通过指针数组buffer返回调用程序的回溯信息，也就是所谓的函数调用栈。buffer数组中的元素是void*类型，也就是栈中保存的返回地址。

size参数指定buffer中可以保存的地址的最大个数。如果实际的回溯信息大于size，则只返回最近的size个地址。

backtrace函数返回buffer中保存的地址个数，返回值不会大于size。如果返回值小于size，则说明所有的回溯信息都已经返回了，如果等于size，则有可能被截断了。

backtrace函数在buffer数组中返回的都是一些虚拟地址，不适于分析。backtrace_symbols函数可以将backtrace返回的buffer中的地址，根据符号表中的信息，转换为字符串（函数名+偏移地址）。size参数指明了buffer中的地址个数。

backtrace_symbols返回字符串数组的首地址，该字符串是在backtrace_symbols中通过malloc分配的，因此，调用者必须使用free释放内存。如果发生了错误，则backtrace_symbols返回NULL

backtrace_symbols_fd类似于backtrace_symbols，只不过它是把字符串信息写到文件描述符fd所表示的文件中。backtrace_symbols_fd不会调用malloc函数

来自网上转载的
#include <signal.h>

#include <execinfo.h>

#include <stdio.h>

#include <stdlib.h>

#include <ucontext.h>

#define BTSIZE 100

/*
 * Return the instruction pointer saved in a signal context.
 *
 * uc: ucontext_t handed to an SA_SIGINFO handler.
 *
 * Returns the saved program counter for the platforms listed below, or NULL
 * when the platform/architecture is not covered. The listing had no fallback
 * inside the Linux branch, so on an unlisted architecture the function ended
 * without returning a value; ARM and AArch64 cases plus a NULL fallback are
 * added here.
 */
static void *getMcontextEip(ucontext_t *uc) {
#if defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
    /* OSX < 10.6 */
    #if defined(__x86_64__)
    return (void*) uc->uc_mcontext->__ss.__rip;
    #elif defined(__i386__)
    return (void*) uc->uc_mcontext->__ss.__eip;
    #else
    return (void*) uc->uc_mcontext->__ss.__srr0;
    #endif
#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
    /* OSX >= 10.6 */
    #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
    return (void*) uc->uc_mcontext->__ss.__rip;
    #else
    return (void*) uc->uc_mcontext->__ss.__eip;
    #endif
#elif defined(__linux__)
    /* Linux */
    #if defined(__i386__)
    return (void*) uc->uc_mcontext.gregs[14]; /* Linux 32 */
    #elif defined(__X86_64__) || defined(__x86_64__)
    return (void*) uc->uc_mcontext.gregs[16]; /* Linux 64 */
    #elif defined(__ia64__) /* Linux IA64 */
    return (void*) uc->uc_mcontext.sc_ip;
    #elif defined(__arm__) /* Linux ARM (32-bit) */
    return (void*) uc->uc_mcontext.arm_pc;
    #elif defined(__aarch64__) /* Linux AArch64 */
    return (void*) uc->uc_mcontext.pc;
    #else
    /* Unknown Linux architecture: report "not available" instead of falling off the end. */
    (void) uc;
    return NULL;
    #endif
#else
    (void) uc;
    return NULL;
#endif
}

/*
 * SA_SIGINFO handler that prints signal details and, for SIGSEGV, a
 * symbolised backtrace before ending the process.
 *
 * sig:    number of the delivered signal.
 * info:   siginfo_t of the signal; si_signo and si_addr are printed.
 * secret: signal context (ucontext_t cast to void *), passed to
 *         getMcontextEip().
 *
 * For SIGSEGV the handler never returns: it calls exit(0) after printing,
 * or exit(EXIT_FAILURE) when backtrace_symbols() fails. For any other signal
 * it returns after the three diagnostic lines.
 */
static void sig_handler(int sig, siginfo_t *info, void *secret)
{
    ucontext_t *ctx = (ucontext_t*) secret;
    void *frames[BTSIZE];
    char **symbols;
    void *saved_pc;
    int depth = 0;
    int idx;

    printf("in sig_handler\n");
    printf("sig is %d, SIGSEGV is %d\n", sig, SIGSEGV);
    printf("info.si_signo is %d, info.si_addr is %p\n",
        info->si_signo, info->si_addr);

    if (sig != SIGSEGV)
        return;

    depth = backtrace(frames, BTSIZE);
    printf("backtrace() returned %d addresses\n", depth);

    /* When the context yields an address, it replaces slot 1 of the captured frames. */
    saved_pc = getMcontextEip(ctx);
    if (saved_pc != NULL)
        frames[1] = saved_pc;

    symbols = backtrace_symbols(frames, depth);
    if (symbols == NULL) {
        perror("backtrace_symbols");
        exit(EXIT_FAILURE);
    }

    printf("backtrace: \n");
    idx = 0;
    while (idx < depth) {
        printf("[%d]%s\n", idx, symbols[idx]);
        idx++;
    }

    free(symbols);
    exit(0);
}

#ifdef CONFIG_ARM_UNWIND

/*
 * With CONFIG_ARM_UNWIND the backtrace is delegated entirely to
 * unwind_backtrace().
 */
static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
{
    unwind_backtrace(regs, tsk);
}

#else

/*
 * Frame-pointer based backtrace.
 *
 * regs: register set to start from, or NULL.
 * tsk:  task to trace; NULL means the current task.
 *
 * The starting frame pointer comes from regs when given, otherwise from the
 * saved frame pointer of tsk when tsk is not the current task, otherwise
 * from the live fp register. c_backtrace() is only called when that frame
 * pointer is non-zero and passes verify_stack().
 */
static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
{
    unsigned int fp, mode;
    int ok = 1;

    printk("Backtrace: ");

    if (!tsk)
        tsk = current;

    if (regs) {
        fp = frame_pointer(regs);
        mode = processor_mode(regs);
    } else if (tsk != current) {
        fp = thread_saved_fp(tsk);
        mode = 0x10;
    } else {
        /* Tracing ourselves without regs: read the fp register directly. */
        asm("mov %0, fp" : "=r" (fp) : : "cc");
        mode = 0x10;
    }

    if (!fp) {
        pr_cont("no frame pointer");
        ok = 0;
    } else if (verify_stack(fp)) {
        pr_cont("invalid frame pointer 0x%08x", fp);
        ok = 0;
    } else if (fp < (unsigned long)end_of_stack(tsk)) {
        /* Reported only; ok stays set, so the backtrace is still attempted. */
        pr_cont("frame pointer underflow");
    }
    pr_cont("\n");

    if (ok)
        c_backtrace(fp, mode);
}

#endif /* CONFIG_ARM_UNWIND - this #endif was missing from the listing */

注意，编译器的优化策略可能导致得到的回溯信息不准确。而且，对于GNU编译器而言，必须使用-rdynamic链接选项（-rdynamic用来通知链接器将所有符号添加到动态符号表中），才能正确解析出符号名。此时也可以使用unwind方法回溯。

coredump文件本身主要的格式也是ELF格式，因此，我们可以通过readelf命令进行判断。

内核的get_signal函数会判断当前信号是不是要触发core dump，如果是，则调用do_coredump。

do_coredump最后会调用elf_core_dump。下面以内核代码中的elf_core_dump函数为入口，分析core文件是怎么生成的：