// pesticide/main.cpp

#include <stdio.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <elf.h>
#include <cstddef>
#include <cstring>
#include <cstdlib>
#include <assert.h>

#include "enums.h"
#include "enum_names.h"

// STD
// TODO: maybe do not use this
#include <vector>

// STRINGIFY(symbol) expands to the spelling of `symbol` as a string literal.
// The extra level of indirection makes the preprocessor expand `symbol` first
// if it is itself a macro. The helper avoids a leading underscore followed by
// an uppercase letter, since such identifiers are reserved for the implementation.
#define STRINGIFY_IMPL(symbol) #symbol
#define STRINGIFY(symbol) STRINGIFY_IMPL(symbol)


// Header found at the start of a compilation unit in .debug_info, in the
// 64-bit DWARF layout (escape word followed by a 64-bit length).
// The struct is packed because it is overlaid directly on the file bytes.
// TODO: dwarf32
// page 200
typedef struct __attribute__((packed)) {
  uint32_t preface;              // should be 0xFFFFFFFF for 64b
  uint64_t unit_length;          // bytes in the unit that follow this length field
  uint16_t version;              // should be 5
  uint8_t unit_type;             // should be DW_UT_compile or DW_UT_partial
  uint8_t address_size;          // should be 8 for 64b systems
  uint64_t debug_abbrev_offset;  // offset of this unit's abbreviations in .debug_abbrev
} compilation_unit_header_t;

// The parser advances past the header with sizeof(), so the layout must not
// pick up padding: 4 + 8 + 2 + 1 + 1 + 8 bytes.
static_assert(sizeof(compilation_unit_header_t) == 24,
              "compilation_unit_header_t must match the on-disk layout");


// Header of one set of entries in the .debug_aranges section, in the 64-bit
// DWARF layout. Packed because it is overlaid directly on the file bytes.
// TODO: dwarf32
typedef struct __attribute__((packed)) {
  uint32_t preface;              // should be 0xFFFFFFFF for 64b
  uint64_t unit_length;
  uint16_t version;
  uint64_t debug_info_offset;    // used by parse_aranges_section as the CU header offset in .debug_info
  uint8_t address_size;          // should be 8 for 64b systems
  uint8_t segment_selector_size; // the parser only supports 0 here
} debug_aranges_header_t;

// Guard against accidental padding: 4 + 8 + 2 + 8 + 1 + 1 bytes.
static_assert(sizeof(debug_aranges_header_t) == 24);

// One abbreviation declaration as decoded from .debug_abbrev
// (the attribute specifications are stored separately, see abbrev_table_t).
typedef struct {
  uint64_t abbrev_code; // ? (the code DIEs in .debug_info start with; decoded as LEB128)
  uint32_t abbrev_tag;  // DW_TAG_*
  uint8_t children;     // DW_CHILDREN_*
} debug_abbrev_entry_t;

// One (name, form) attribute specification belonging to an abbreviation.
typedef struct {
  uint64_t name;       // DW_AT_*
  uint64_t form;       // DW_FORM_*
  uint64_t value;      // if DW_FORM_implicit_const
} attribute_spec_t;

// Decoded abbreviation table. The two vectors are parallel: specs[i] holds the
// attribute specifications of entries[i].
// TODO: Store this in a more space-sensitive way
typedef struct {
  std::vector<debug_abbrev_entry_t> entries;
  std::vector<std::vector<attribute_spec_t>> specs;
} abbrev_table_t;

// Global abbreviation table; filled by parse_debuginfo_section and then used
// by it to decode the DIEs of the compilation unit.
abbrev_table_t abbrev_table;

// function signatures
// `file` is always the start of the in-memory image of the ELF file.

// Dumps the compilation unit header, its abbreviations and its DIEs.
void parse_debuginfo_section(const void *file);
// Dumps the .debug_aranges header/tuples found at file offset `offset`.
void parse_aranges_section(const void *file, uint64_t offset);
// Returns sh_addralign of the named section, or (uint64_t)-1 if not found.
uint64_t get_alignment_of_section(const void *file, const char *name);
// Returns sh_offset of the named section, or (uint64_t)-1 if not found.
uint64_t get_offset_of_section(const void *file, const char *name);
// Prints the name of each of the `n` section headers in `table`.
void parse_section_names(const void *file, Elf64_Shdr* table, int n, int strtab_index);
// Prints the fields of the ELF file header.
void print_header(Elf64_Ehdr *header);

// Decodes one unsigned LEB128 value starting at `src` and stores it in `*dest`.
//
// Every encoded byte carries 7 payload bits, least-significant group first;
// bit 0x80 is set on every byte except the last one of the value.
//
// Returns the number of bytes consumed, i.e. how far the caller has to advance
// its read pointer.
//
// TODO: Deal with cases where size of LEB128 > word size. For now payload bits
// that do not fit into 64 bits are dropped (their bytes are still consumed).
static inline int decode_leb128(uint8_t *src, uint64_t *dest)
{
    uint64_t result = 0;
    int nbytes = 0;
    uint8_t val;
    do {
      val = src[nbytes];
      const int shift = 7 * nbytes;
      // Widen to 64 bits *before* shifting: shifting the promoted int would
      // overflow from the fifth byte on. Shifting by >= 64 is undefined, so
      // groups beyond the word size are skipped.
      if (shift < 64)
        result |= (uint64_t)(val & 0x7f) << shift;
      ++nbytes;
    } while (val & 0x80);

    *dest = result;
    // One input byte per 7-bit group.
    return nbytes;
}

// globals
// Offset of the compilation unit header inside .debug_info. Set by
// parse_aranges_section (from debug_info_offset) and read by
// parse_debuginfo_section; stays 0 if .debug_aranges is never parsed.
uint64_t cu_header_offset = 0;

void parse_debuginfo_section(const void *file)
{
    char *p = (char*)file;

    uint64_t debuginfo_offset = get_offset_of_section(file, ".debug_info");
    fprintf(stdout, "Offset of .debug_info section: %d\n", debuginfo_offset);
    fprintf(stdout, "Offset of CUH in section: %d\n", cu_header_offset);
    p += debuginfo_offset + cu_header_offset;

    compilation_unit_header_t* cuh = (compilation_unit_header_t*)p;

    fprintf(stdout, "CUH length: %#x\n", cuh->unit_length);
    fprintf(stdout, "CUH version: %d\n", cuh->version);
    // TODO: multifile dwarf
    fprintf(stdout, "CUH unit_type: %d (%s)\n", cuh->unit_type, (cuh->unit_type == DW_UT_compile ? STRINGIFY(DW_UT_compile) : "?"));
    fprintf(stdout, "CUH address_size: %d\n", cuh->address_size);
    fprintf(stdout, "CUH debug_abbrev_offset: %d\n", cuh->debug_abbrev_offset);

    uint64_t abbrev_section_offset = get_offset_of_section(file, ".debug_abbrev");
    // Pointer to .debug_abbrev section memory for current CU
    char *cu_abbrev = ((char*)file) + abbrev_section_offset + cuh->debug_abbrev_offset;

    fprintf(stdout, "ABBREV for this compilation unit should start at: %x\n", abbrev_section_offset + cuh->debug_abbrev_offset);

    // begin parsing proper
    uint64_t abbrev_code;
    uint64_t abbrev_tag;

    // TODO: Error handling
    do {
      cu_abbrev += decode_leb128((uint8_t*)cu_abbrev, &abbrev_code);
      // The entries for a compilation unit end with a 0-byte abbrev code
      if (!abbrev_code)
        break;
      cu_abbrev += decode_leb128((uint8_t*)cu_abbrev, &abbrev_tag);

      bool children = *cu_abbrev++;
      fprintf(stdout, " %d %s, children = %s\n",  abbrev_code, dwarf_get_TAG_name(abbrev_tag), children ? "yes" : "no");

      abbrev_table.entries.push_back({abbrev_code, (uint32_t)abbrev_tag, children});

      std::vector<attribute_spec_t> attr_specs = {}; // for this code

      uint64_t attrib_name;
      uint64_t attrib_form;
      uint64_t implicit_const = 0;
      // TODO: Error handling
      do {
        cu_abbrev += decode_leb128((uint8_t*)cu_abbrev, &attrib_name);
        cu_abbrev += decode_leb128((uint8_t*)cu_abbrev, &attrib_form);
        if (attrib_form == DW_FORM_implicit_const)
        {
            cu_abbrev += decode_leb128((uint8_t*)cu_abbrev, &implicit_const);
            fprintf(stdout, "\t%-26s %s value: %d\n", dwarf_get_AT_name(attrib_name), dwarf_get_FORM_name(attrib_form), implicit_const);
        }
        else if (!attrib_name && !attrib_form)
          fprintf(stdout, "\t0\n");
        else
          fprintf(stdout, "\t%-26s %s\n", dwarf_get_AT_name(attrib_name), dwarf_get_FORM_name(attrib_form));

        if (attrib_name && attrib_form)
        {
          attr_specs.push_back({attrib_name, attrib_form, implicit_const});
        }
      } while (attrib_name && attrib_form);
      abbrev_table.specs.push_back(attr_specs);
    } while (true);

    p += sizeof(compilation_unit_header_t);

    fprintf(stdout, "==================================\n");
    uint64_t code;
    fprintf(stdout, ".debug_info offset for first DIE: %#x\n", ((uint64_t)p - (uint64_t)file));
    // TODO: 32bit size
    // TODO: unit_length is supposed to be 4/12 Bytes
    for (; ((uint64_t)p - (uint64_t)cuh) < cuh->unit_length + sizeof(compilation_unit_header_t) - 12;)
    {
        p += decode_leb128((uint8_t*)p, &code);

        fprintf(stdout, "%d (%s)\n", code, dwarf_get_TAG_name(abbrev_table.entries[code-1].abbrev_tag));
        for (size_t i = 0; i < abbrev_table.specs[code-1].size(); ++i)
        {
            fprintf(stdout, "\t%s\t\t", dwarf_get_AT_name(abbrev_table.specs[code-1][i].name));
            // Get the desired value according to FORM
            switch (abbrev_table.specs[code-1][i].form)
            {
            case DW_FORM_addr: // 0x01
              {
                  // TODO: 32bit addresses
                  uint64_t addr_value = *(uint64_t*)p;
                  p+= 8;
                  fprintf(stdout, "\t%#x\n", addr_value);
              } break;
            case DW_FORM_data2: // 0x05
              {
                  uint16_t data = *(uint16_t*)p;
                  p += 2;
                  fprintf(stdout, "\t%#x\n", data);
              } break;
            case DW_FORM_data4: // 0x06
              {
                  uint32_t data = *(uint32_t*)p;
                  p += 4;
                  fprintf(stdout, "\t%#x\n", data);
              } break;
            case DW_FORM_data8: // 0x06
              {
                  uint64_t data = *(uint64_t*)p;
                  p += 8;
                  fprintf(stdout, "\t%#x\n", data);
              } break;
            case DW_FORM_string: // 0x08
              {
                  // String is inside the .debug_info, just read it and advance pointer past null terminator
                  fprintf(stdout, "\t%s\n", p);
                  while (*p++);
              } break;
            case DW_FORM_data1: // 0x13
              {
                  uint8_t data = *(uint8_t*)p;
                  p += 1;
                  if (abbrev_table.specs[code-1][i].name == DW_AT_language)
                    fprintf(stdout, "\t%s\n", dwarf_get_LANG_name(data));
                  else
                    fprintf(stdout, "\t%#x\n", data);
              } break;

            case DW_FORM_ref8:
              {
                  uint64_t data = *(uint64_t*)p;
                  p += 8;
                  fprintf(stdout, "\t%#x\n", data);
              } break;
            case DW_FORM_sec_offset: // 0x17
              {
                  // TODO: 32 addresses
                  // TODO: read more about this. Could both files exist at the same time?
                  // NOTE: objdump does not seem to try reading the string, instead outputs the pointer value
                  // read offset into .debug_rnglists or .debug_loclists of desired string
                  uint64_t str_offset = *(uint64_t*)p;
                  p+= 8;
                  uint64_t section_offset = get_offset_of_section(file, ".debug_rnglists");
                  char *string = (char*)file + section_offset + str_offset;
                  fprintf(stdout, "\t%s\n", string);
              } break;
            case DW_FORM_exprloc: // 0x18
              {
                  uint64_t length;
                   p += decode_leb128((uint8_t*)p, &length);
                   // TODO: save this data
                   p += length;
                   fprintf(stdout, "(%d bytes data)\n");
              } break;
            case DW_FORM_flag_present:
              {
                  // Nothing to read here, flag_present just indicates that a flag is ON. We output '1' just like objdump does
                  fprintf(stdout, "\t1\n");
              } break;
            case DW_FORM_implicit_const:
              {
                  // We already have the value, do not advance the pointer at all
                  fprintf(stdout, "\t%#x\n", abbrev_table.specs[code-1][i].value);
              } break;
            case DW_FORM_strp: // 0x0e
              {
                  // TODO: 32 addresses
                  // read offset into .debug_str of desired string
                  uint64_t str_offset = *(uint64_t*)p;
                  p+= 8;
                  uint64_t section_offset = get_offset_of_section(file, ".debug_str");
                  char *string = (char*)file + section_offset + str_offset;
                  fprintf(stdout, "\t%s\n", string);
              } break;

            case DW_FORM_line_strp: // 0x1f
              {
                  // TODO: 32 addresses
                  // read offset into .debug_line_str of desired string
                  uint64_t str_offset = *(uint64_t*)p;
                  p+= 8;
                  uint64_t section_offset = get_offset_of_section(file, ".debug_line_str");
                  char *string = (char*)file + section_offset + str_offset;
                  fprintf(stdout, "\t%s\n", string);
              } break;
            default:
              fprintf(stdout, "\tTODO\n");
              break;
            }
        }
        fprintf(stdout, "\n");
    }
}

// Dumps the first set of the .debug_aranges section to stdout and records the
// set's debug_info_offset in the global `cu_header_offset`.
//
// `file` points to the start of the in-memory ELF image and `offset` is the
// file offset of .debug_aranges; (uint64_t)-1 (the "not found" value of
// get_offset_of_section) makes the function report the problem and return.
//
// Only the 64-bit DWARF layout with 8-byte addresses and no segment selectors
// is supported; anything else terminates the program with exit(1).
void parse_aranges_section(const void *file, uint64_t offset)
{
    if (offset == (uint64_t)-1)
    {
        fprintf(stderr, "No .debug_aranges section found\n");
        return;
    }

    debug_aranges_header_t *headerinfo = (debug_aranges_header_t*)((char*)file + offset);
    fprintf(stdout, ".debug_aranges info:\n");
    fprintf(stdout, "\tunit_length: %llu\n", (unsigned long long)headerinfo->unit_length);
    fprintf(stdout, "\tversion: %d\n", headerinfo->version);
    fprintf(stdout, "\tdebug_info_offset: %llu\n", (unsigned long long)headerinfo->debug_info_offset);
    fprintf(stdout, "\taddress_size: %d\n", headerinfo->address_size);
    fprintf(stdout, "\tsegment_selector_size: %d\n", headerinfo->segment_selector_size);
    // TODO: Deal with more than one address
    cu_header_offset = headerinfo->debug_info_offset;

    fprintf(stdout, "\tADDRESS\tLENGTH:\n");

    // Validate the sizes before they are used: they determine the tuple size,
    // which is the divisor of the alignment computation below.
    // TODO: dwarf32
    if (headerinfo->address_size != 8)
    {
        fprintf(stderr, "Address_size != 8 not implemented yet");
        exit(1);
    }
    if (headerinfo->segment_selector_size)
    {
        fprintf(stderr, "Segment selectors are not implemented yet");
        exit(1);
    }

    // The first tuple starts at the first offset after the header that is a
    // multiple of the tuple size.
    const size_t tuple_size = headerinfo->segment_selector_size + (size_t)headerinfo->address_size * 2;
    const size_t first_tuple = (sizeof(debug_aranges_header_t) + tuple_size - 1) / tuple_size * tuple_size;

    // unit_length counts the bytes after the 12-byte length field; use it to
    // stop even if the terminating (0, 0) tuple is missing.
    const char *start = (const char*)headerinfo;
    const char *end = start + sizeof(headerinfo->preface) + sizeof(headerinfo->unit_length) + headerinfo->unit_length;

    for (const char *p = start + first_tuple; p + tuple_size <= end; p += tuple_size)
    {
        // The section data is not guaranteed to be aligned for uint64_t.
        uint64_t address;
        uint64_t length;
        memcpy(&address, p, sizeof(address));
        memcpy(&length, p + headerinfo->address_size, sizeof(length));

        fprintf(stdout, "\t%llx", (unsigned long long)address);
        fprintf(stdout, "\t%llx\n", (unsigned long long)length);

        // A tuple of two zeroes terminates the set.
        if (!address && !length)
            break;
    }
}

// Returns the sh_addralign value of the section called `name`.
//
// `file` points to the start of the in-memory ELF64 image. Section headers of
// type SHT_NULL are ignored. If no section has that name the result is
// (uint64_t)-1.
uint64_t get_alignment_of_section(const void *file, const char *name)
{
    const char *base = (const char *)file;
    const Elf64_Ehdr *ehdr = (const Elf64_Ehdr *)file;
    const Elf64_Shdr *sections = (const Elf64_Shdr *)(base + ehdr->e_shoff);

    // Section names are stored in the string table selected by e_shstrndx.
    const char *names = base + sections[ehdr->e_shstrndx].sh_offset;

    const Elf64_Shdr *const last = sections + ehdr->e_shnum;
    for (const Elf64_Shdr *sec = sections; sec != last; ++sec)
    {
        if (sec->sh_type != SHT_NULL && strcmp(names + sec->sh_name, name) == 0)
            return sec->sh_addralign;
    }
    return (uint64_t)-1;
}

// Returns the file offset (sh_offset) of the section called `name`.
//
// `file` points to the start of the in-memory ELF64 image. Section headers of
// type SHT_NULL are skipped. When no section matches, (uint64_t)-1 is returned.
uint64_t get_offset_of_section(const void *file, const char *name)
{
    const char *image = (const char *)file;
    const Elf64_Ehdr *header = (const Elf64_Ehdr *)file;
    const Elf64_Shdr *table = (const Elf64_Shdr *)(image + header->e_shoff);
    const unsigned count = header->e_shnum;

    // The section-name string table is itself a section, found via e_shstrndx.
    const char *name_pool = image + table[header->e_shstrndx].sh_offset;

    unsigned idx = 0;
    while (idx < count)
    {
        const Elf64_Shdr *current = &table[idx++];
        if (current->sh_type == SHT_NULL)
            continue;
        if (strcmp(&name_pool[current->sh_name], name) == 0)
            return current->sh_offset;
    }
    return (uint64_t)-1;
}

// Prints "Section #<index>: <name>" for each of the `n` section headers in
// `table`, leaving out headers of type SHT_NULL and SHT_SHLIB.
//
// `file` is the start of the in-memory ELF image and `strtab_index` is the
// index (within `table`) of the section that stores the section names.
void parse_section_names(const void *file, Elf64_Shdr* table, int n, int strtab_index)
{
    const char *name_pool = (const char *)file + table[strtab_index].sh_offset;

    for (int idx = 0; idx < n; ++idx)
    {
        const Elf64_Shdr *section = &table[idx];
        if (section->sh_type != SHT_NULL && section->sh_type != SHT_SHLIB)
        {
            fprintf(stdout, "Section #%d: %s\n", idx, name_pool + section->sh_name);
        }
    }
}

// Prints the fields of the ELF64 file header `header` to stdout: first the
// e_ident bytes, then the object file type and the remaining header fields.
void print_header(Elf64_Ehdr *header)
{
    fprintf(stdout, "ELF identification:\n");
    fprintf(stdout, "\tEL_MAG0: %#13x\n", header->e_ident[0]);
    fprintf(stdout, "\tEL_MAG1: %10c\n", header->e_ident[1]);
    fprintf(stdout, "\tEL_MAG2: %10c\n", header->e_ident[2]);
    fprintf(stdout, "\tEL_MAG3: %10c\n", header->e_ident[3]);
    // The class byte is 1 for 32-bit and 2 for 64-bit objects, hence the "* 32".
    fprintf(stdout, "\tEL_CLASS: %9d (ELFCLASS%d)\n", header->e_ident[4], header->e_ident[4] * 32);
    fprintf(stdout, "\tEL_DATA: %10d (ELFDATA2%cSB)\n", header->e_ident[5], header->e_ident[5] == 1 ? 'L' : 'M');
    fprintf(stdout, "\tEL_VERSION: %7d (%s)\n", header->e_ident[6], header->e_ident[6] == 1 ? "EV_CURRENT" : "?");
    fprintf(stdout, "\tEL_OSABI: %9d (ELFOSABI_%s)\n", header->e_ident[7], header->e_ident[7] == 0 ? "SYSV" : header->e_ident[7] == 1 ? "HPUX" : "STANDALONE");
    fprintf(stdout, "\tEL_ABIVERSION: %4d\n", header->e_ident[8]);
    fprintf(stdout, "\tEL_PAD:           /* Padding bytes */\n");
    // EI_NIDENT is the size of e_ident, not a byte stored inside it
    // (e_ident[15] is just the last padding byte).
    fprintf(stdout, "\tEL_NIDENT: %8d\n", EI_NIDENT);

    fprintf(stdout, "Object file type: ");

#define ET_CASE(name) case name: fprintf(stdout, "(" #name ")\n"); break
    switch (header->e_type)
    {
        ET_CASE(ET_NONE);
        ET_CASE(ET_REL);
        ET_CASE(ET_EXEC);
        ET_CASE(ET_DYN);
        ET_CASE(ET_CORE);
        ET_CASE(ET_LOOS);
        ET_CASE(ET_HIOS);
        ET_CASE(ET_LOPROC);
        ET_CASE(ET_HIPROC);
        default:
            // Always terminate the line, also for values without a named constant.
            fprintf(stdout, "(unknown: %#x)\n", header->e_type);
            break;
    }
#undef ET_CASE

    // e_entry, e_phoff and e_shoff are 64 bits wide in Elf64_Ehdr; convert
    // them explicitly so the arguments match the conversion specifiers.
    fprintf(stdout, "Machine type: %#x\n", header->e_machine);
    fprintf(stdout, "Object file version: %u (%s)\n", (unsigned)header->e_version, header->e_version == 1 ? "EV_CURRENT" : "?");
    fprintf(stdout, "Entry point address: %#llx:\n", (unsigned long long)header->e_entry);
    fprintf(stdout, "Program header offset: %llu:\n", (unsigned long long)header->e_phoff);
    fprintf(stdout, "Section header offset: %llu:\n", (unsigned long long)header->e_shoff);
    fprintf(stdout, "Processor-specific flags: %#x\n", (unsigned)header->e_flags);
    fprintf(stdout, "ELF header size: %d\n", header->e_ehsize);
    fprintf(stdout, "Program header entry size: %d\n", header->e_phentsize);
    fprintf(stdout, "# of program header entries: %d\n", header->e_phnum);
    fprintf(stdout, "Size of section header entry: %d\n", header->e_shentsize);
    fprintf(stdout, "# of section header entries: %d\n", header->e_shnum);
    fprintf(stdout, "Section name string table index: %d\n", header->e_shstrndx);
}

// Entry point: maps the ELF file named by argv[1] read-only and dumps its
// header, its section names, .debug_aranges and one compilation unit of
// .debug_info to stdout.
//
// Returns 0 on success and 1 if the file cannot be opened, mapped or is not a
// usable 64-bit ELF file with debug information.
int main(int argc, char *argv[])
{
    if (argc < 2)
    {
        fprintf(stdout, "Usage: pesticide [ELF binary path]\n");
        return 1;
    }
    fprintf(stdout, "Trying to read %s\n", argv[1]);
    int fd = open(argv[1], O_RDONLY);
    if (fd < 0)
    {
        fprintf(stderr, "Error trying to read %s\n", argv[1]);
        return 1;
    }
    struct stat st;
    if (fstat(fd, &st) != 0)
    {
        fprintf(stderr, "Error trying to stat %s\n", argv[1]);
        close(fd);
        return 1;
    }
    // Everything below dereferences the ELF header, so it has to be there.
    if (st.st_size < 0 || (uint64_t)st.st_size < sizeof(Elf64_Ehdr))
    {
        fprintf(stderr, "%s is too small to be an ELF file\n", argv[1]);
        close(fd);
        return 1;
    }
    const uint64_t file_size = (uint64_t)st.st_size;

    fprintf(stdout, "MMapping %llu bytes\n", (unsigned long long)file_size);
    void *addr = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
    // The mapping stays valid after the descriptor is closed.
    close(fd);
    if (addr == MAP_FAILED)
    {
        fprintf(stderr, "Error MMapping %s\n", argv[1]);
        return 1;
    }

    int status = 1;
    Elf64_Ehdr *elf_header = (Elf64_Ehdr *)addr;

    // The parsers use the Elf64_* structures unconditionally.
    if (memcmp(elf_header->e_ident, ELFMAG, SELFMAG) != 0 || elf_header->e_ident[EI_CLASS] != ELFCLASS64)
    {
        fprintf(stderr, "%s is not a 64-bit ELF file\n", argv[1]);
    }
    // The section header table and the name string table index must lie
    // inside the mapping before any section lookup is attempted.
    else if (elf_header->e_shoff > file_size ||
             elf_header->e_shnum > (file_size - elf_header->e_shoff) / sizeof(Elf64_Shdr) ||
             elf_header->e_shstrndx >= elf_header->e_shnum)
    {
        fprintf(stderr, "%s has a missing or malformed section header table\n", argv[1]);
    }
    else
    {
        print_header(elf_header);

        Elf64_Shdr *elf_section_table = (Elf64_Shdr *)((char*)addr + elf_header->e_shoff);
        parse_section_names(addr, elf_section_table, elf_header->e_shnum, elf_header->e_shstrndx);

        // get_offset_of_section reports a missing section as (uint64_t)-1.
        const uint64_t not_found = (uint64_t)-1;
        const uint64_t aranges_offset = get_offset_of_section(addr, ".debug_aranges");

        if (get_offset_of_section(addr, ".debug_info") == not_found ||
            get_offset_of_section(addr, ".debug_abbrev") == not_found)
        {
            fprintf(stderr, "%s has no .debug_info/.debug_abbrev sections\n", argv[1]);
        }
        else
        {
            if (aranges_offset == not_found)
            {
                // Without .debug_aranges the compilation unit offset keeps its initial value.
                fprintf(stdout, "Section .debug_aranges not found\n");
            }
            else
            {
                fprintf(stdout, "Offset of section .debug_aranges is %llu\n", (unsigned long long)aranges_offset);
                fprintf(stdout, "Byte alignment of section .debug_aranges is %llu\n",
                        (unsigned long long)get_alignment_of_section(addr, ".debug_aranges"));
                parse_aranges_section(addr, aranges_offset);
            }
            parse_debuginfo_section(addr);
            status = 0;
        }
    }

    munmap(addr, file_size);
    return status;
}