From 34a2e860efda4d585503202b6019f9deec1acf7d Mon Sep 17 00:00:00 2001
From: Alexander Lantsev <aunsane@gmail.com>
Date: Tue, 16 Feb 2016 11:29:23 +0000
Subject: git-svn-id: http://svn.miranda-ng.org/main/trunk@16284
 1316c22d-e87f-b044-9b9b-93d7a3e3ba9c

---
 plugins/MirLua/Modules/luaffi/src/call_x86.dasc | 1594 +++++++++++++++++++++++
 1 file changed, 1594 insertions(+)
 create mode 100644 plugins/MirLua/Modules/luaffi/src/call_x86.dasc

(limited to 'plugins/MirLua/Modules/luaffi/src/call_x86.dasc')

diff --git a/plugins/MirLua/Modules/luaffi/src/call_x86.dasc b/plugins/MirLua/Modules/luaffi/src/call_x86.dasc
new file mode 100644
index 0000000000..fa8271a2a1
--- /dev/null
+++ b/plugins/MirLua/Modules/luaffi/src/call_x86.dasc
@@ -0,0 +1,1594 @@
+/* vim: ts=4 sw=4 sts=4 et tw=78
+ * Copyright (c) 2011 James R. McKaskill. See license in ffi.h
+ */
+|.if X64
+|.arch x64
+|.else
+|.arch x86
+|.endif
+/* build_actionlist is the action list handed to every dasm_setup() call in this file */
+|.actionlist build_actionlist
+|.globalnames globnames
+|.externnames extnames
+/* 32 bit only: the edx:eax pair that carries a 64 bit integer (high half : low half) */
+|.if not X64
+|.define RET_H, edx // for int64_t returns
+|.define RET_L, eax
+|.endif
+
+|.if X64WIN
+| // Win64: arguments are loaded into rcx, rdx, r8, r9, last argument first.
+|.macro call_rrrp, func, arg0, arg1, arg2, arg3
+| mov64 r9, arg3
+| mov r8, arg2
+| mov rdx, arg1
+| mov rcx, arg0
+| call func
+|.endmacro
+|.macro call_rrrr, func, arg0, arg1, arg2, arg3
+| mov r9, arg3
+| mov r8, arg2
+| mov rdx, arg1
+| mov rcx, arg0
+| call func
+|.endmacro
+| // 'r' arguments are loaded with mov; a trailing 'p' argument is loaded with mov64.
+|.macro call_rrp, func, arg0, arg1, arg2
+| mov64 r8, arg2
+| mov rdx, arg1
+| mov rcx, arg0
+| call func
+|.endmacro
+|.macro call_rrr, func, arg0, arg1, arg2
+| mov r8, arg2
+| mov rdx, arg1
+| mov rcx, arg0
+| call func
+|.endmacro
+|
+|.macro call_rp, func, arg0, arg1
+| mov64 rdx, arg1
+| mov rcx, arg0
+| call func
+|.endmacro
+|.macro call_rr, func, arg0, arg1
+| mov rdx, arg1
+| mov rcx, arg0
+| call func
+|.endmacro
+|
+|.macro call_r, func, arg0
+| mov rcx, arg0
+| call func
+|.endmacro
+|
+|.elif X64
+| // Other x64 targets: arguments are loaded into rdi, rsi, rdx, rcx, r8, r9.
+| // the 5 and 6 arg forms are only used on posix x64
+|.macro call_rrrrrr, func, arg0, arg1, arg2, arg3, arg4, arg5
+| mov r9, arg5
+| mov r8, arg4
+| mov rcx, arg3
+| mov rdx, arg2
+| mov rsi, arg1
+| mov rdi, arg0
+| call func
+|.endmacro
+|.macro call_rrrrr, func, arg0, arg1, arg2, arg3, arg4
+| mov r8, arg4
+| mov rcx, arg3
+| mov rdx, arg2
+| mov rsi, arg1
+| mov rdi, arg0
+| call func
+|.endmacro
+|
+|.macro call_rrrp, func, arg0, arg1, arg2, arg3
+| mov64 rcx, arg3
+| mov rdx, arg2
+| mov rsi, arg1
+| mov rdi, arg0
+| call func
+|.endmacro
+|.macro call_rrrr, func, arg0, arg1, arg2, arg3
+| mov rcx, arg3
+| mov rdx, arg2
+| mov rsi, arg1
+| mov rdi, arg0
+| call func
+|.endmacro
+|
+|.macro call_rrp, func, arg0, arg1, arg2
+| mov64 rdx, arg2
+| mov rsi, arg1
+| mov rdi, arg0
+| call func
+|.endmacro
+|.macro call_rrr, func, arg0, arg1, arg2
+| mov rdx, arg2
+| mov rsi, arg1
+| mov rdi, arg0
+| call func
+|.endmacro
+|
+|.macro call_rp, func, arg0, arg1
+| mov64 rsi, arg1
+| mov rdi, arg0
+| call func
+|.endmacro
+|.macro call_rr, func, arg0, arg1
+| mov rsi, arg1
+| mov rdi, arg0
+| call func
+|.endmacro
+|
+|.macro call_r, func, arg0
+| mov rdi, arg0
+| call func
+|.endmacro
+|
+|.else
+| // define the 64bit registers to the 32 bit counterparts, so the common
+| // code can use r*x for all pointers
+|.define rax, eax
+|.define rcx, ecx
+|.define rdx, edx
+|.define rsp, esp
+|.define rbp, ebp
+|.define rdi, edi
+|.define rsi, esi
+|.define mov64, mov
+| // 32 bit: arguments are written to [rsp], [rsp+4], ...; rsp itself is not adjusted.
+|.macro call_rrrr, func, arg0, arg1, arg2, arg3
+| mov dword [rsp+12], arg3
+| mov dword [rsp+8], arg2
+| mov dword [rsp+4], arg1
+| mov dword [rsp], arg0
+| call func
+|.endmacro
+|.macro call_rrr, func, arg0, arg1, arg2
+| mov dword [rsp+8], arg2
+| mov dword [rsp+4], arg1
+| mov dword [rsp], arg0
+| call func
+|.endmacro
+|.macro call_rr, func, arg0, arg1
+| mov dword [rsp+4], arg1
+| mov dword [rsp], arg0
+| call func
+|.endmacro
+|.macro call_r, func, arg0
+| mov dword [rsp], arg0
+| call func
+|.endmacro
+| // mov64 is plain mov here, so the 'p' forms are just aliases of the 'r' forms.
+|.define call_rrrp, call_rrrr
+|.define call_rrp, call_rrr
+|.define call_rp, call_rr
+|
+|.endif
+
+#if defined _WIN64 || defined __amd64__
+#define JUMP_SIZE 14 /* 8 byte function pointer + 6 byte indirect jmp, see compile_extern_jump */
+#else
+#define JUMP_SIZE 4 /* function pointer only */
+#endif
+/* NOTE(review): the three constants below are not referenced in the visible part of this file - presumably used by the code that includes it; confirm */
+#define MIN_BRANCH INT32_MIN
+#define MAX_BRANCH INT32_MAX
+#define BRANCH_OFF 4
+
+static void compile_extern_jump(struct jit* jit, lua_State* L, cfunction func, uint8_t* code)
+{
+    /* Fills in one extern jump slot at 'code'. Each slot starts with the raw
+     * function pointer; 64 bit builds follow it with a stub that jumps
+     * through that pointer, so targets more than 2 GB away stay reachable.
+     *
+     * The bytes are written by hand because there are commands buffered in
+     * the jit state and dynasm doesn't support rip relative addressing.
+     *
+     * 64 bit slot:
+     *   0-8:  function ptr
+     *   8-14: jmp aword [rip-14]
+     *
+     * 32 bit slot: just the function ptr, as it can always fit in a 32 bit
+     * displacement.
+     *
+     * 'jit' and 'L' are unused here.
+     */
+    *(cfunction*) code = func;
+#if defined _WIN64 || defined __amd64__
+    code[8] = 0xFF; /* opcode FF, /4 form: indirect jmp */
+    code[9] = 0x25; /* operand is [rip + disp32] */
+    *(int32_t*) &code[10] = -JUMP_SIZE; /* rip is already past the stub */
+#endif
+}
+
+void compile_globals(struct jit* jit, lua_State* L)
+{
+    struct jit* Dst = jit;
+    int* perr = &jit->last_errno; /* address is hard-coded into the get_errno macro below */
+    dasm_setup(Dst, build_actionlist);
+    /* Everything below emits the shared global labels (->lua_return_*, ->too_*_arguments, ->save_registers) used by the other jitted code in this file. */
+    /* Note: since the return code uses EBP to reset the stack pointer, we
+     * don't have to track the amount of stack space used. It also means we
+     * can handle stdcall and cdecl with the same code.
+     */
+
+    /* Note the various call_* functions want 32 bytes of 16 byte aligned
+     * stack
+     */
+
+    |.if X64
+    |.define L_ARG, r12
+    |.define TOP, r13
+    |.else
+    |.define L_ARG, rdi
+    |.define TOP, rsi
+    |.endif
+    /* epilog: reload TOP and L_ARG from the frame slots compile_function pushed them to, then drop the frame and return */
+    |.macro epilog
+    |.if X64
+    | mov TOP, [rbp-16]
+    | mov L_ARG, [rbp-8]
+    |.else
+    | mov TOP, [rbp-8]
+    | mov L_ARG, [rbp-4]
+    |.endif
+    | mov rsp, rbp
+    | pop rbp
+    | ret
+    |.endmacro
+    /* get_errno: store the value returned by GetLastError into jit->last_errno (through perr) */
+    |.macro get_errno // note trashes registers
+    | call extern GetLastError
+    | mov64 rcx, perr
+    | mov dword [rcx], eax
+    |.endmacro
+
+    /* the general idea for the return functions is:
+     * 1) Save return value on stack
+     * 2) Call get_errno (this trashes the registers hence #1)
+     * 3) Unpack return value from stack
+     * 4) Call lua push function
+     * 5) Set eax to number of returned args (0 or 1)
+     * 6) Call return which pops our stack frame
+     */
+
+    |->lua_return_arg:
+    | mov eax, 1
+    | epilog
+    /* void: nothing to push, only the errno value is recorded */
+    |->lua_return_void:
+    | get_errno
+    | mov eax, 0
+    | epilog
+    /* double: the value is taken from xmm0 on x64 and from the x87 stack on 32 bit */
+    |->lua_return_double:
+    |.if X64
+    | movq qword [rsp+32], xmm0
+    |.else
+    | fstp qword [rsp+4] // note get_errno doesn't require any stack on x86
+    |.endif
+    |
+    | get_errno
+    |
+    |.if X64WIN
+    | movq xmm1, qword [rsp+32]
+    | mov rcx, L_ARG
+    |.elif X64
+    | movq xmm0, qword [rsp+32]
+    | mov rdi, L_ARG
+    |.else
+    | mov [rsp], L_ARG
+    |.endif
+    | call extern lua_pushnumber
+    | jmp ->lua_return_arg
+    /* bool: al is zero-extended to 32 bits before it is saved across get_errno */
+    |->lua_return_bool:
+    | movzx eax, al
+    | mov [rsp+32], eax
+    | get_errno
+    | mov eax, [rsp+32]
+    | call_rr extern lua_pushboolean, L_ARG, rax
+    | jmp ->lua_return_arg
+
+    |->lua_return_int:
+    | mov [rsp+32], eax
+    | get_errno
+    | mov eax, [rsp+32]
+    | call_rr extern push_int, L_ARG, rax
+    | jmp ->lua_return_arg
+
+    |->lua_return_uint:
+    | mov [rsp+32], eax
+    | get_errno
+    | mov eax, [rsp+32]
+    | call_rr extern push_uint, L_ARG, rax
+    | jmp ->lua_return_arg
+    /* Argument-count errors, jumped to from compile_function's prologue. No epilog follows the calls - presumably because luaL_error does not return. */
+    |->too_few_arguments:
+    | mov ax, 0
+    | call_rp extern luaL_error, L_ARG, &"too few arguments"
+
+    |->too_many_arguments:
+    | mov ax, 0
+    | call_rp extern luaL_error, L_ARG, &"too many arguments"
+    /* save_registers: called right after a callback's prologue (see compile_callback) to spill the argument registers to the slots get_int/get_float read back */
+    |->save_registers:
+    | // use rbp relative so we store values in the outer stack frame
+    |.if X64WIN
+    | // use the provided shadow space for int registers above prev rbp and
+    | // return address
+    | mov [rbp+16], rcx
+    | mov [rbp+24], rdx
+    | mov [rbp+32], r8
+    | mov [rbp+40], r9
+    | // use the extra space we added for float registers
+    | // -16 to store underneath previous value of L_ARG
+    | movq qword [rbp-16], xmm0
+    | movq qword [rbp-24], xmm1
+    | movq qword [rbp-32], xmm2
+    | movq qword [rbp-40], xmm3
+    |.elif X64
+    | movq qword [rbp-16], xmm0
+    | movq qword [rbp-24], xmm1
+    | movq qword [rbp-32], xmm2
+    | movq qword [rbp-40], xmm3
+    | movq qword [rbp-48], xmm4
+    | movq qword [rbp-56], xmm5
+    | movq qword [rbp-64], xmm6
+    | movq qword [rbp-72], xmm7
+    | mov [rbp-80], rdi
+    | mov [rbp-88], rsi
+    | mov [rbp-96], rdx
+    | mov [rbp-104], rcx
+    | mov [rbp-112], r8
+    | mov [rbp-120], r9
+    |.else
+    | // fastcall, -8 to store underneath previous value of L_ARG
+    | mov [rbp-8], ecx
+    | mov [rbp-12], edx
+    |.endif
+    | ret
+
+    compile(Dst, L, NULL, LUA_NOREF);
+}
+
+int x86_return_size(lua_State* L, int usr, const struct ctype* ct)
+{
+    /* Adds up the stack bytes of the argument types at indices 1..n of the
+     * table at 'usr' (nothing for C_CALL); 32 bit builds add a pointer when
+     * the return type (index 0) is a non-pointer complex double.
+     * NOTE(review): FAST_CALL's ecx/edx arguments are counted too - confirm. */
+    const struct ctype* arg;
+    int bytes = 0;
+    if (ct->calling_convention != C_CALL) {
+        size_t count = lua_rawlen(L, usr);
+        size_t idx;
+        for (idx = 1; idx <= count; idx++) {
+            lua_rawgeti(L, usr, (int) idx);
+            arg = (const struct ctype*) lua_touserdata(L, -1);
+            if (!arg->pointers) {
+                switch (arg->type) {
+                case BOOL_TYPE:
+                case FLOAT_TYPE:
+                case INT8_TYPE:
+                case INT16_TYPE:
+                case INT32_TYPE:
+                case ENUM_TYPE:
+                    bytes += 4;
+                    break;
+                case DOUBLE_TYPE:
+                case COMPLEX_FLOAT_TYPE:
+                case INT64_TYPE:
+                    bytes += 8;
+                    break;
+                case COMPLEX_DOUBLE_TYPE:
+                    bytes += 16;
+                    break;
+                case INTPTR_TYPE:
+                    bytes += sizeof(intptr_t);
+                    break;
+                case FUNCTION_PTR_TYPE:
+                    bytes += sizeof(cfunction);
+                    break;
+                default:
+                    return luaL_error(L, "NYI - argument type");
+                }
+            } else {
+                bytes += sizeof(void*);
+            }
+            lua_pop(L, 1);
+        }
+    }
+
+#if !defined _WIN64 && !defined __amd64__
+    lua_rawgeti(L, usr, 0);
+    arg = (const struct ctype*) lua_touserdata(L, -1);
+    if (arg->type == COMPLEX_DOUBLE_TYPE && !arg->pointers) {
+        bytes += sizeof(void*);
+    }
+    lua_pop(L, 1);
+#endif
+    return bytes;
+}
+
+#ifdef _WIN64
+#define MAX_REGISTERS(ct) 4 /* rcx, rdx, r8, r9 */
+/* one shared limit: get_int and get_float both count against reg_alloc.regs */
+#elif defined __amd64__
+#define MAX_INT_REGISTERS(ct) 6 /* rdi, rsi, rdx, rcx, r8, r9 */
+#define MAX_FLOAT_REGISTERS(ct) 8 /* xmm0-7 */
+/* separate limits: ints and floats are counted independently (reg_alloc.ints / reg_alloc.floats) */
+#else
+#define MAX_INT_REGISTERS(ct) ((ct)->calling_convention == FAST_CALL ? 2 /* ecx, edx */ : 0)
+#define MAX_FLOAT_REGISTERS(ct) 0 /* 32 bit floats are always read from / written to the stack */
+#endif
+
+struct reg_alloc { /* running position of the next argument while get_* / add_* emit code */
+#ifdef _WIN64
+    int regs; /* register slots handed out so far, ints and floats combined */
+    int is_float[4]; /* add_float sets the entry for the slot it fills to 1 */
+    int is_int[4]; /* add_int sets the entry for the slot it fills to 1 */
+#else
+    int floats; /* float registers handed out so far */
+    int ints; /* int registers handed out so far */
+#endif
+    int off; /* byte offset of the next stack argument: from rbp in get_*, from rsp in add_* */
+};
+
+#ifdef _WIN64
+#define REGISTER_STACK_SPACE(ct) (4*8) /* four 8 byte register slots */
+#elif defined __amd64__
+#define REGISTER_STACK_SPACE(ct) (14*8) /* 8 xmm + 6 int registers, 8 bytes each (see save_registers) */
+#else
+#define REGISTER_STACK_SPACE(ct) ALIGN_UP(((ct)->calling_convention == FAST_CALL ? 2*4 : 0), 15) /* ecx + edx for FAST_CALL, passed through ALIGN_UP */
+#endif
+
+/* Fastcall:
+ * Uses ecx, edx as first two int registers
+ * Everything else on stack (include 64bit ints)
+ * No overflow stack space
+ * Pops the stack before returning
+ * Returns int in eax, float in ST0
+ * We use the same register allocation logic as posix x64 with 2 int regs and 0 float regs
+ */
+
+static void get_int(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_int64)
+{ /* Emits a load of the next incoming integer argument into rcx (ecx, plus edx for the high half of an int64 on 32 bit). */
+    /* while registers remain, read the slot save_registers stored the register in */
+#ifdef _WIN64
+    if (reg->regs < MAX_REGISTERS(ct)) {
+        | mov rcx, [rbp + 16 + 8*reg->regs]
+        reg->regs++;
+    }
+#elif __amd64__
+    if (reg->ints < MAX_INT_REGISTERS(ct)) {
+        | mov rcx, [rbp - 80 - 8*reg->ints]
+        reg->ints++;
+    }
+#else
+    if (!is_int64 && reg->ints < MAX_INT_REGISTERS(ct)) { /* an int64 never takes the register path on 32 bit */
+        | mov ecx, [rbp - 8 - 4*reg->ints]
+        reg->ints++;
+    }
+#endif
+    else if (is_int64) { /* 8 byte stack argument at rbp + reg->off */
+        |.if X64
+        | mov rcx, [rbp + reg->off]
+        |.else
+        | mov rcx, [rbp + reg->off]
+        | mov rdx, [rbp + reg->off + 4]
+        |.endif
+        reg->off += 8;
+    } else { /* 4 byte stack argument at rbp + reg->off */
+        | mov ecx, [rbp + reg->off]
+        reg->off += 4;
+    }
+}
+
+static void add_int(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_int64)
+{ /* Emits a store of rax (the RET_L/RET_H pair for an int64 on 32 bit) into the next outgoing integer argument slot. */
+#ifdef _WIN64
+    if (reg->regs < MAX_REGISTERS(ct)) {
+        | mov [rsp + 32 + 8*(reg->regs)], rax
+        reg->is_int[reg->regs++] = 1;
+    }
+#elif __amd64__
+    if (reg->ints < MAX_INT_REGISTERS(ct)) {
+        | mov [rsp + 32 + 8*reg->ints], rax
+        reg->ints++;
+    }
+#else
+    if (!is_int64 && reg->ints < MAX_INT_REGISTERS(ct)) {
+        | mov [rsp + 32 + 4*reg->ints], rax
+        reg->ints++;
+    }
+#endif
+    /* register arguments were staged at rsp+32 above; anything else is appended to the stack arguments at rsp + reg->off */
+    else if (is_int64) {
+        |.if X64
+        | mov [rsp + reg->off], rax
+        |.else
+        | mov [rsp + reg->off], RET_L
+        | mov [rsp + reg->off + 4], RET_H
+        |.endif
+        reg->off += 8;
+    } else {
+        | mov [rsp+reg->off], eax
+        reg->off += 4;
+    }
+}
+
+static void get_float(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_double)
+{ /* Emits a load of the next incoming float/double argument: onto the x87 stack on 32 bit, into xmm0 on x64. */
+#if !defined _WIN64 && !defined __amd64__
+    assert(MAX_FLOAT_REGISTERS(ct) == 0);
+    if (is_double) {
+        | fld qword [rbp + reg->off]
+        reg->off += 8;
+    } else {
+        | fld dword [rbp + reg->off]
+        reg->off += 4;
+    }
+#else
+    int off;
+    /* float registers spilled by save_registers sit below rbp, 8 bytes apart, starting at rbp-16 */
+#ifdef _WIN64
+    if (reg->regs < MAX_REGISTERS(ct)) {
+        off = -16 - 8*reg->regs;
+        reg->regs++;
+    }
+#else
+    if (reg->floats < MAX_FLOAT_REGISTERS(ct)) {
+        off = -16 - 8*reg->floats;
+        reg->floats++;
+    }
+#endif
+    else {
+        off = reg->off;
+        reg->off += is_double ? 8 : 4;
+    }
+    /* a single is widened with cvtss2sd, so xmm0 holds a double either way */
+    if (is_double) {
+        | movq xmm0, qword [rbp + off]
+    } else {
+        | cvtss2sd xmm0, dword [rbp + off]
+    }
+#endif
+}
+
+static void add_float(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_double)
+{ /* Emits a store of the current value (x87 top on 32 bit, xmm0 on x64) into the next outgoing float/double argument slot. */
+#if !defined _WIN64 && !defined __amd64__
+    assert(MAX_FLOAT_REGISTERS(ct) == 0);
+    if (is_double) {
+        | fstp qword [rsp + reg->off]
+        reg->off += 8;
+    } else {
+        | fstp dword [rsp + reg->off]
+        reg->off += 4;
+    }
+#else
+    /* register arguments are staged in 8 byte slots from rsp+32 (after the int slots on non-Windows); a single is narrowed with cvtsd2ss first */
+#ifdef _WIN64
+    if (reg->regs < MAX_REGISTERS(ct)) {
+        if (is_double) {
+            | movq qword [rsp + 32 + 8*(reg->regs)], xmm0
+        } else {
+            | cvtsd2ss xmm0, xmm0
+            | movq qword [rsp + 32 + 8*(reg->regs)], xmm0
+        }
+        reg->is_float[reg->regs++] = 1;
+    }
+#else
+    if (reg->floats < MAX_FLOAT_REGISTERS(ct)) {
+        if (is_double) {
+            | movq qword [rsp + 32 + 8*(MAX_INT_REGISTERS(ct) + reg->floats)], xmm0
+        } else {
+            | cvtsd2ss xmm0, xmm0
+            | movq qword [rsp + 32 + 8*(MAX_INT_REGISTERS(ct) + reg->floats)], xmm0
+        }
+        reg->floats++;
+    }
+#endif
+    /* no register slot left: append to the stack arguments at rsp + reg->off */
+    else if (is_double) {
+        | movq qword [rsp + reg->off], xmm0
+        reg->off += 8;
+    } else {
+        | cvtsd2ss xmm0, xmm0
+        | movd dword [rsp + reg->off], xmm0
+        reg->off += 4;
+    }
+#endif
+}
+
+#if defined _WIN64 || defined __amd64__ /* 64 bit: pointers go through the int64 path */
+#define add_pointer(jit, ct, reg) add_int(jit, ct, reg, 1)
+#define get_pointer(jit, ct, reg) get_int(jit, ct, reg, 1)
+#else /* 32 bit: pointers go through the plain int path */
+#define add_pointer(jit, ct, reg) add_int(jit, ct, reg, 0)
+#define get_pointer(jit, ct, reg) get_int(jit, ct, reg, 0)
+#endif
+
+cfunction compile_callback(lua_State* L, int fidx, int ct_usr, const struct ctype* ct)
+{
+    int i, nargs;
+    cfunction* pf;
+    struct ctype ct2 = *ct;
+    const struct ctype* mt;
+    struct reg_alloc reg;
+    int num_upvals = 0;
+    int top = lua_gettop(L);
+    struct jit* Dst = get_jit(L);
+    int ref;
+    int hidden_arg_off = 0;
+    /* Emits a native function that forwards its C arguments to the Lua function at fidx and converts the Lua result back; the code pointer is pushed as cdata and returned. */
+    ct_usr = lua_absindex(L, ct_usr);
+    fidx = lua_absindex(L, fidx);
+
+    assert(lua_isnil(L, fidx) || lua_isfunction(L, fidx));
+
+    memset(&reg, 0, sizeof(reg));
+#ifdef _WIN64
+    reg.off = 16 + REGISTER_STACK_SPACE(ct); /* stack registers are above the shadow space */
+#elif __amd64__
+    reg.off = 16;
+#else
+    reg.off = 8;
+#endif
+
+    dasm_setup(Dst, build_actionlist);
+
+    // add a table to store ctype and function upvalues
+    // callback_set assumes the first value is the lua function
+    nargs = (int) lua_rawlen(L, ct_usr);
+    lua_newtable(L);
+    lua_pushvalue(L, -1);
+    ref = luaL_ref(L, LUA_REGISTRYINDEX);
+    /* NOTE(review): the registry ref taken above is not released when the error below is raised */
+    if (ct->has_var_arg) {
+        luaL_error(L, "can't create callbacks with varargs");
+    }
+
+    // setup a stack frame to hold args for the call into lua_call
+
+    | push rbp
+    | mov rbp, rsp
+    | push L_ARG
+    | // stack is 4 or 8 (mod 16) (L_ARG, rbp, rip)
+    |.if X64
+    | // 8 to realign, 16 for return vars, 32 for local calls, rest to save registers
+    | sub rsp, 8 + 16 + 32 + REGISTER_STACK_SPACE(ct)
+    | call ->save_registers
+    |.else
+    | // 4 to realign, 16 for return vars, 32 for local calls, rest to save registers
+    | sub rsp, 4 + 16 + 32 + REGISTER_STACK_SPACE(ct)
+    if (ct->calling_convention == FAST_CALL) {
+        | call ->save_registers
+    }
+    |.endif
+
+    // hardcode the lua_State* value into the assembly
+    | mov64 L_ARG, L
+
+    /* get the upval table */
+    | call_rrr extern lua_rawgeti, L_ARG, LUA_REGISTRYINDEX, ref
+
+    /* get the lua function */
+    lua_pushvalue(L, fidx);
+    lua_rawseti(L, -2, ++num_upvals);
+    assert(num_upvals == CALLBACK_FUNC_USR_IDX);
+    | call_rrr extern lua_rawgeti, L_ARG, -1, num_upvals
+
+#if !defined _WIN64 && !defined __amd64__
+    lua_rawgeti(L, ct_usr, 0);
+    mt = (const struct ctype*) lua_touserdata(L, -1);
+    if (!mt->pointers && mt->type == COMPLEX_DOUBLE_TYPE) {
+        hidden_arg_off = reg.off;
+        reg.off += sizeof(void*);
+    }
+    lua_pop(L, 1);
+#else
+    (void) hidden_arg_off;
+#endif
+    /* emit code that pushes each incoming C argument onto the Lua stack, in declaration order */
+    for (i = 1; i <= nargs; i++) {
+        lua_rawgeti(L, ct_usr, i);
+        mt = (const struct ctype*) lua_touserdata(L, -1);
+
+        if (mt->pointers) {
+            lua_getuservalue(L, -1);
+            lua_rawseti(L, -3, ++num_upvals); /* usr value */
+            lua_rawseti(L, -2, ++num_upvals); /* mt */
+            /* on the lua stack in the callback:
+             * upval tbl, lua func, i-1 args
+             */
+            | call_rrr extern lua_rawgeti, L_ARG, -i-1, num_upvals-1
+            | call_rrp extern push_cdata, L_ARG, -1, mt
+            get_pointer(Dst, ct, &reg);
+            | mov [rax], rcx
+            | call_rr, extern lua_remove, L_ARG, -2
+        } else {
+            switch (mt->type) {
+            case INT64_TYPE:
+                lua_getuservalue(L, -1);
+                lua_rawseti(L, -3, ++num_upvals); /* usr value */
+                lua_pop(L, 1);
+                | call_rrp extern push_cdata, L_ARG, 0, mt
+                get_int(Dst, ct, &reg, 1);
+                |.if X64
+                | mov [rax], rcx
+                |.else
+                | mov [rax], ecx
+                | mov [rax+4], edx
+                |.endif
+                break;
+
+            case INTPTR_TYPE:
+                lua_getuservalue(L, -1);
+                lua_rawseti(L, -3, ++num_upvals); /* usr value */
+                lua_pop(L, 1);
+                | call_rrp extern push_cdata, L_ARG, 0, mt
+                get_pointer(Dst, ct, &reg);
+                | mov [rax], rcx
+                break;
+
+            case COMPLEX_FLOAT_TYPE:
+                lua_pop(L, 1);
+#if defined _WIN64 || defined __amd64__
+                /* complex floats are two floats packed into a double */
+                | call_rrp extern push_cdata, L_ARG, 0, mt
+                get_float(Dst, ct, &reg, 1);
+                | movq qword [rax], xmm0
+#else
+                /* complex floats are real followed by imag on the stack */
+                | call_rrp extern push_cdata, L_ARG, 0, mt
+                get_float(Dst, ct, &reg, 0);
+                | fstp dword [rax]
+                get_float(Dst, ct, &reg, 0);
+                | fstp dword [rax+4]
+#endif
+                break;
+
+            case COMPLEX_DOUBLE_TYPE:
+                lua_pop(L, 1);
+                | call_rrp extern push_cdata, L_ARG, 0, mt
+                /* real */
+                get_float(Dst, ct, &reg, 1);
+                |.if X64
+                | movq qword [rax], xmm0
+                |.else
+                | fstp qword [rax]
+                |.endif
+                /* imag */
+                get_float(Dst, ct, &reg, 1);
+                |.if X64
+                | movq qword [rax+8], xmm0
+                |.else
+                | fstp qword [rax+8]
+                |.endif
+                break;
+
+            case FLOAT_TYPE:
+            case DOUBLE_TYPE:
+                lua_pop(L, 1);
+                get_float(Dst, ct, &reg, mt->type == DOUBLE_TYPE);
+                |.if X64WIN
+                | movq xmm1, xmm0
+                | mov rcx, L_ARG
+                |.elif X64
+                | // for 64bit xmm0 is already set
+                | mov rdi, L_ARG
+                |.else
+                | fstp qword [rsp+4]
+                | mov [rsp], L_ARG
+                |.endif
+                | call extern lua_pushnumber
+                break;
+
+            case BOOL_TYPE:
+                lua_pop(L, 1);
+                get_int(Dst, ct, &reg, 0);
+                | movzx ecx, cl
+                | call_rr extern lua_pushboolean, L_ARG, rcx
+                break;
+
+            case INT8_TYPE:
+                lua_pop(L, 1);
+                get_int(Dst, ct, &reg, 0);
+                if (mt->is_unsigned) {
+                    | movzx ecx, cl
+                } else {
+                    | movsx ecx, cl
+                }
+                | call_rr extern push_int, L_ARG, rcx
+                break;
+
+            case INT16_TYPE:
+                lua_pop(L, 1);
+                get_int(Dst, ct, &reg, 0);
+                if (mt->is_unsigned) {
+                    | movzx ecx, cx
+                } else {
+                    | movsx ecx, cx
+                }
+                | call_rr extern push_int, L_ARG, rcx
+                break;
+
+            case ENUM_TYPE:
+            case INT32_TYPE:
+                lua_pop(L, 1);
+                get_int(Dst, ct, &reg, 0);
+                if (mt->is_unsigned) {
+                    | call_rr extern push_uint, L_ARG, rcx
+                } else {
+                    | call_rr extern push_int, L_ARG, rcx
+                }
+                break;
+
+            default:
+                luaL_error(L, "NYI: callback arg type");
+            }
+        }
+    }
+    /* index 0 of the ct_usr table holds the return type */
+    lua_rawgeti(L, ct_usr, 0);
+    mt = (const struct ctype*) lua_touserdata(L, -1);
+    /* NOTE(review): four arguments are loaded below; if lua_callk also takes a continuation function parameter it is left unset here - confirm */
+    | call_rrrp extern lua_callk, L_ARG, nargs, (mt->pointers || mt->type != VOID_TYPE) ? 1 : 0, 0
+
+    // Unpack the return argument if not "void", also clean-up the lua stack
+    // to remove the return argument and bind table. Use lua_settop rather
+    // than lua_pop as lua_pop is implemented as a macro.
+    if (mt->pointers) {
+        lua_getuservalue(L, -1);
+        lua_rawseti(L, -3, ++num_upvals); /* usr value */
+        lua_rawseti(L, -2, ++num_upvals); /* mt */
+        | call_rrr extern lua_rawgeti, L_ARG, -2, num_upvals-1
+        | call_rrrp extern check_typed_pointer, L_ARG, -2, -1, mt
+        | mov [rsp+32], rax
+        | call_rr extern lua_settop, L_ARG, -4
+        | mov rax, [rsp+32]
+
+    } else {
+        switch (mt->type) {
+        case ENUM_TYPE:
+            lua_getuservalue(L, -1);
+            lua_rawseti(L, -3, ++num_upvals); /* usr value */
+            lua_rawseti(L, -2, ++num_upvals); /* mt */
+            | call_rrr extern lua_rawgeti, L_ARG, -2, num_upvals-1
+            | call_rrrp, extern check_enum, L_ARG, -2, -1, mt
+            | mov [rsp+32], eax
+            | call_rr extern lua_settop, L_ARG, -4
+            | mov eax, [rsp+32]
+            break;
+
+        case VOID_TYPE:
+            lua_pop(L, 1);
+            | call_rr extern lua_settop, L_ARG, -2
+            break;
+
+        case BOOL_TYPE:
+        case INT8_TYPE:
+        case INT16_TYPE:
+        case INT32_TYPE:
+            lua_pop(L, 1);
+            if (mt->is_unsigned) {
+                | call_rr extern check_uint32, L_ARG, -1
+            } else {
+                | call_rr extern check_int32, L_ARG, -1
+            }
+            | mov [rsp+32], eax
+            | call_rr extern lua_settop, L_ARG, -3
+            | mov eax, [rsp+32]
+            break;
+
+        case INT64_TYPE:
+            lua_pop(L, 1);
+
+            if (mt->is_unsigned) {
+                | call_rr extern check_uint64, L_ARG, -1
+            } else {
+                | call_rr extern check_int64, L_ARG, -1
+            }
+
+            |.if X64
+            | mov [rsp+32], rax
+            |.else
+            | mov [rsp+32], RET_L
+            | mov [rsp+36], RET_H
+            |.endif
+            | call_rr extern lua_settop, L_ARG, -3
+            |.if X64
+            | mov rax, [rsp+32]
+            |.else
+            | mov RET_L, [rsp+32]
+            | mov RET_H, [rsp+36]
+            |.endif
+            break;
+
+        case INTPTR_TYPE:
+            lua_pop(L, 1);
+            | call_rr extern check_uintptr, L_ARG, -1
+            | mov [rsp+32], rax
+            | call_rr extern lua_settop, L_ARG, -3
+            | mov rax, [rsp+32]
+            break;
+
+        case FLOAT_TYPE:
+        case DOUBLE_TYPE:
+            lua_pop(L, 1);
+            | call_rr extern check_double, L_ARG, -1
+            |.if X64
+            | movq qword [rsp+32], xmm0
+            | call_rr extern lua_settop, L_ARG, -3
+            if (mt->type == FLOAT_TYPE) {
+                | cvtsd2ss xmm0, qword [rsp+32]
+            } else {
+                | movq xmm0, qword [rsp+32]
+            }
+            |.else
+            | fstp qword [rsp+32]
+            | call_rr extern lua_settop, L_ARG, -3
+            | fld qword [rsp+32]
+            |.endif
+            break;
+
+        case COMPLEX_FLOAT_TYPE:
+            lua_pop(L, 1);
+#if !defined HAVE_COMPLEX
+            luaL_error(L, "ffi lib compiled without complex number support");
+#endif
+            /* on 64 bit complex floats are two floats packed into a double,
+             * on 32 bit returned complex floats use eax and edx */
+            | call_rr extern check_complex_float, L_ARG, -1
+            |
+            |.if X64
+            | movq qword [rsp+32], xmm0
+            |.else
+            | mov [rsp+32], eax
+            | mov [rsp+36], edx
+            |.endif
+            |
+            | call_rr extern lua_settop, L_ARG, -3
+            |
+            |.if X64
+            | movq xmm0, qword [rsp+32]
+            |.else
+            | mov eax, [rsp+32]
+            | mov edx, [rsp+36]
+            |.endif
+            break;
+
+        case COMPLEX_DOUBLE_TYPE:
+            lua_pop(L, 1);
+#if !defined HAVE_COMPLEX
+            luaL_error(L, "ffi lib compiled without complex number support");
+#endif
+            /* on 64 bit, returned complex doubles use xmm0, xmm1, on 32 bit
+             * there is a hidden first parameter that points to 16 bytes where
+             * the returned arg is stored which is popped by the called
+             * function */
+#if defined _WIN64 || defined __amd64__
+            | call_rr extern check_complex_double, L_ARG, -1
+            | movq qword [rsp+32], xmm0
+            | movq qword [rsp+40], xmm1
+            | call_rr extern lua_settop, L_ARG, -3
+            | movq xmm0, qword [rsp+32]
+            | movq xmm1, qword [rsp+40]
+#else
+            | mov rcx, [rbp + hidden_arg_off]
+            | call_rrr extern check_complex_double, rcx, L_ARG, -1
+            | sub rsp, 4 // to realign from popped hidden arg
+            | call_rr extern lua_settop, L_ARG, -3
+#endif
+            break;
+
+        default:
+            luaL_error(L, "NYI: callback return type");
+        }
+    }
+    /* restore L_ARG, drop the frame and return, popping x86_return_size() bytes of arguments */
+    |.if X64
+    | mov L_ARG, [rbp-8]
+    |.else
+    | mov L_ARG, [rbp-4]
+    |.endif
+    | mov rsp, rbp
+    | pop rbp
+    | ret x86_return_size(L, ct_usr, ct)
+
+    lua_pop(L, 1); /* upval table - already in registry */
+    assert(lua_gettop(L) == top);
+    /* the cdata pushed below stays on the Lua stack (top + 1) and receives the compiled code pointer */
+    ct2.is_jitted = 1;
+    pf = (cfunction*) push_cdata(L, ct_usr, &ct2);
+    *pf = compile(Dst, L, NULL, ref);
+
+    assert(lua_gettop(L) == top + 1);
+
+    return *pf;
+}
+
+void compile_function(lua_State* L, cfunction func, int ct_usr, const struct ctype* ct)
+{
+    size_t i, nargs;
+    int num_upvals;
+    const struct ctype* mbr_ct;
+    struct jit* Dst = get_jit(L);
+    struct reg_alloc reg;
+    void* p;
+    int top = lua_gettop(L);
+    int* perr = &Dst->last_errno;
+
+    ct_usr = lua_absindex(L, ct_usr);
+
+    memset(&reg, 0, sizeof(reg));
+    reg.off = 32 + REGISTER_STACK_SPACE(ct);
+
+    dasm_setup(Dst, build_actionlist);
+
+    p = push_cdata(L, ct_usr, ct);
+    *(cfunction*) p = func;
+    num_upvals = 1;
+
+    nargs = lua_rawlen(L, ct_usr);
+
+    if (ct->calling_convention != C_CALL && ct->has_var_arg) {
+        luaL_error(L, "vararg is only allowed with the c calling convention");
+    }
+
+    | push rbp
+    | mov rbp, rsp
+    | push L_ARG
+    | push TOP
+    | // stack is 0 (mod 16) (TOP, L_ARG, rbp, rip)
+    |
+    | // Get L from our arguments and allocate some stack for lua_gettop
+    |.if X64WIN
+    | mov L_ARG, rcx
+    | sub rsp, 32 // shadow space
+    |.elif X64
+    | mov L_ARG, rdi
+    |.else
+    | mov L_ARG, [rbp + 8]
+    | sub rsp, 16
+    |.endif
+    |
+    | call_r extern lua_gettop, L_ARG
+    | mov TOP, rax // no need for movzxd rax, eax - high word guarenteed to be zero by x86-64
+    | cmp rax, nargs
+    | jl ->too_few_arguments
+
+    if (!ct->has_var_arg) {
+        | jg ->too_many_arguments
+    }
+
+    /* no need to zero extend eax returned by lua_gettop to rax as x86-64
+     * guarantees that the upper 32 bits will be zero */
+    | shl rax, 4 // reserve 16 bytes per argument - this maintains the alignment mod 16
+    | sub rsp, rax
+    | sub rsp, 32 + REGISTER_STACK_SPACE(ct) // reserve an extra 32 to call local functions
+
+#if !defined _WIN64 && !defined __amd64__
+    /* Returned complex doubles require a hidden first parameter where the
+     * data is stored, which is popped by the calling code. */
+    lua_rawgeti(L, ct_usr, 0);
+    mbr_ct = (const struct ctype*) lua_touserdata(L, -1);
+    if (!mbr_ct->pointers && mbr_ct->type == COMPLEX_DOUBLE_TYPE) {
+        /* we can allocate more space for arguments as long as no add_*
+         * function has been called yet, mbr_ct will be added as an upvalue in
+         * the return processing later */
+        | call_rrp extern push_cdata, L_ARG, 0, mbr_ct
+        | sub rsp, 16
+        add_pointer(Dst, ct, &reg);
+    }
+    lua_pop(L, 1);
+#endif
+
+    for (i = 1; i <= nargs; i++) {
+        lua_rawgeti(L, ct_usr, (int) i);
+        mbr_ct = (const struct ctype*) lua_touserdata(L, -1);
+
+        if (mbr_ct->pointers) {
+            lua_getuservalue(L, -1);
+            num_upvals += 2;
+            | call_rrrp extern check_typed_pointer, L_ARG, i, lua_upvalueindex(num_upvals), mbr_ct
+            add_pointer(Dst, ct, &reg);
+        } else {
+            switch (mbr_ct->type) {
+            case FUNCTION_PTR_TYPE:
+                lua_getuservalue(L, -1);
+                num_upvals += 2;
+                | call_rrrp extern check_typed_cfunction, L_ARG, i, lua_upvalueindex(num_upvals), mbr_ct
+                add_pointer(Dst, ct, &reg);
+                break;
+
+            case ENUM_TYPE:
+                lua_getuservalue(L, -1);
+                num_upvals += 2;
+                | call_rrrp, extern check_enum, L_ARG, i, lua_upvalueindex(num_upvals), mbr_ct
+                add_int(Dst, ct, &reg, 0);
+                break;
+
+            case INT8_TYPE:
+                | call_rr extern check_int32, L_ARG, i
+                if (mbr_ct->is_unsigned) {
+                    | movzx eax, al
+                } else {
+                    | movsx eax, al
+                }
+                add_int(Dst, ct, &reg, 0);
+                lua_pop(L, 1);
+                break;
+
+            case INT16_TYPE:
+                | call_rr extern check_int32, L_ARG, i
+                if (mbr_ct->is_unsigned) {
+                    | movzx eax, ax
+                } else {
+                    | movsx eax, ax
+                }
+                add_int(Dst, ct, &reg, 0);
+                lua_pop(L, 1);
+                break;
+
+            case BOOL_TYPE:
+                | call_rr extern check_int32, L_ARG, i
+                | cmp eax, 0
+                | setne al
+                | movzx eax, al
+                add_int(Dst, ct, &reg, 0);
+                lua_pop(L, 1);
+                break;
+
+            case INT32_TYPE:
+                if (mbr_ct->is_unsigned) {
+                    | call_rr extern check_uint32, L_ARG, i
+                } else {
+                    | call_rr extern check_int32, L_ARG, i
+                }
+                add_int(Dst, ct, &reg, 0);
+                lua_pop(L, 1);
+                break;
+
+            case INTPTR_TYPE:
+                | call_rr extern check_uintptr, L_ARG, i
+                add_pointer(Dst, ct, &reg);
+                lua_pop(L, 1);
+                break;
+
+            case INT64_TYPE:
+                if (mbr_ct->is_unsigned) {
+                    | call_rr extern check_uint64, L_ARG, i
+                } else {
+                    | call_rr extern check_int64, L_ARG, i
+                }
+                add_int(Dst, ct, &reg, 1);
+                lua_pop(L, 1);
+                break;
+
+            case DOUBLE_TYPE:
+                | call_rr extern check_double, L_ARG, i
+                add_float(Dst, ct, &reg, 1);
+                lua_pop(L, 1);
+                break;
+
+            case COMPLEX_DOUBLE_TYPE:
+                /* on 64 bit, returned complex doubles use xmm0, xmm1, on 32 bit
+                 * there is a hidden first parameter that points to 16 bytes where
+                 * the returned arg is stored (this is popped by the called
+                 * function) */
+#if defined _WIN64 || defined __amd64__
+                | call_rr extern check_complex_double, L_ARG, i
+                add_float(Dst, ct, &reg, 1);
+                | movq xmm0, xmm1
+                add_float(Dst, ct, &reg, 1);
+#else
+                | lea rax, [rsp+reg.off]
+                | sub rsp, 4
+                | call_rrr extern check_complex_double, rax, L_ARG, i
+                reg.off += 16;
+#endif
+                lua_pop(L, 1);
+                break;
+
+            case FLOAT_TYPE:
+                | call_rr extern check_double, L_ARG, i
+                add_float(Dst, ct, &reg, 0);
+                lua_pop(L, 1);
+                break;
+
+            case COMPLEX_FLOAT_TYPE:
+#if defined _WIN64 || defined __amd64__
+                | call_rr extern check_complex_float, L_ARG, i
+                /* complex floats are two floats packed into a double */
+                add_float(Dst, ct, &reg, 1);
+#else
+                /* returned complex floats use eax and edx */
+                | call_rr extern check_complex_float, L_ARG, i
+                | mov [rsp], eax
+                | fld dword [rsp]
+                add_float(Dst, ct, &reg, 0);
+                | mov [rsp], edx
+                | fld dword [rsp]
+                add_float(Dst, ct, &reg, 0);
+#endif
+                lua_pop(L, 1);
+                break;
+
+            default:
+                luaL_error(L, "NYI: call arg type");
+            }
+        }
+    }
+
+    if (ct->has_var_arg) {
+#ifdef _WIN64
+        |.if X64WIN
+        if (reg.regs < MAX_REGISTERS(ct)) {
+            assert(reg.regs == nargs);
+            | cmp TOP, MAX_REGISTERS(ct)
+            | jle >1
+            | // unpack onto stack
+            | mov rax, rsp
+            | add rax, 32 + 8*MAX_REGISTERS(ct)
+            | call_rrrr extern unpack_varargs_stack, L_ARG, MAX_REGISTERS(ct)+1, TOP, rax
+            | // unpack to registers
+            | mov rax, rsp
+            | add rax, 32 + 8*(reg.regs)
+            | call_rrrr extern unpack_varargs_reg, L_ARG, nargs+1, MAX_REGISTERS(ct), rax
+            | jmp >2
+            |1:
+            | // unpack just to registers
+            | mov rax, rsp
+            | add rax, 32 + 8*(reg.regs)
+            | call_rrrr extern unpack_varargs_reg, L_ARG, nargs+1, TOP, rax
+            |2:
+        } else {
+            | // unpack just to stack
+            | mov rax, rsp
+            | add rax, reg.off
+            | call_rrrr extern unpack_varargs_stack, L_ARG, nargs+1, TOP, rax
+        }
+
+        for (i = nargs; i < MAX_REGISTERS(ct); i++) {
+            reg.is_int[i] = reg.is_float[i] = 1;
+        }
+        reg.regs = MAX_REGISTERS(ct);
+#elif defined __amd64__
+        |.elif X64
+        if (reg.floats < MAX_FLOAT_REGISTERS(ct)) {
+            | mov rax, rsp
+            | add rax, 32 + 8*(MAX_INT_REGISTERS(ct) + reg.floats)
+            | call_rrrrr extern unpack_varargs_float, L_ARG, nargs+1, TOP, MAX_FLOAT_REGISTERS(ct) - reg.floats, rax
+        }
+
+        if (reg.ints < MAX_INT_REGISTERS(ct)) {
+            | mov rax, rsp
+            | add rax, 32 + 8*(reg.ints)
+            | call_rrrrr extern unpack_varargs_int, L_ARG, nargs+1, TOP, MAX_INT_REGISTERS(ct) - reg.ints, rax
+        }
+
+        | mov rax, rsp
+        | add rax, reg.off
+        | call_rrrrrr extern unpack_varargs_stack_skip, L_ARG, nargs+1, TOP, MAX_INT_REGISTERS(ct) - reg.ints, MAX_FLOAT_REGISTERS(ct) - reg.floats, rax
+
+        reg.floats = MAX_FLOAT_REGISTERS(ct);
+        reg.ints = MAX_INT_REGISTERS(ct);
+#else
+        |.else
+        | mov rax, rsp
+        | add rax, reg.off
+        | call_rrrr extern unpack_varargs_stack, L_ARG, nargs+1, TOP, rax
+        |.endif
+#endif
+    }
+
+    | mov64 rcx, perr
+    | mov eax, dword [rcx]
+    | call_r extern SetLastError, rax
+
+    /* remove the stack space to call local functions */
+    |.if X32WIN
+    | add rsp, 28 // SetLastError will have already popped 4
+    |.else
+    | add rsp, 32
+    |.endif
+
+#ifdef _WIN64
+    |.if X64WIN
+    switch (reg.regs) {
+    case 4:
+        if (reg.is_float[3]) {
+            | movq xmm3, qword [rsp + 8*3]
+        }
+        if (reg.is_int[3]) {
+            | mov r9, [rsp + 8*3]
+        }
+    case 3:
+        if (reg.is_float[2]) {
+            | movq xmm2, qword [rsp + 8*2]
+        }
+        if (reg.is_int[2]) {
+            | mov r8, [rsp + 8*2]
+        }
+    case 2:
+        if (reg.is_float[1]) {
+            | movq xmm1, qword [rsp + 8*1]
+        }
+        if (reg.is_int[1]) {
+            | mov rdx, [rsp + 8*1]
+        }
+    case 1:
+        if (reg.is_float[0]) {
+            | movq xmm0, qword [rsp]
+        }
+        if (reg.is_int[0]) {
+            | mov rcx, [rsp]
+        }
+    case 0:
+        break;
+    }
+
+    /* don't remove the space for the registers as we need 32 bytes of register overflow space */
+    assert(REGISTER_STACK_SPACE(ct) == 32);
+
+#elif defined __amd64__
+    |.elif X64
+    switch (reg.floats) {
+    case 8:
+        | movq xmm7, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+7)]
+    case 7:
+        | movq xmm6, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+6)]
+    case 6:
+        | movq xmm5, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+5)]
+    case 5:
+        | movq xmm4, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+4)]
+    case 4:
+        | movq xmm3, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+3)]
+    case 3:
+        | movq xmm2, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+2)]
+    case 2:
+        | movq xmm1, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+1)]
+    case 1:
+        | movq xmm0, qword [rsp + 8*(MAX_INT_REGISTERS(ct))]
+    case 0:
+        break;
+    }
+
+    switch (reg.ints) {
+    case 6:
+        | mov r9, [rsp + 8*5]
+    case 5:
+        | mov r8, [rsp + 8*4]
+    case 4:
+        | mov rcx, [rsp + 8*3]
+    case 3:
+        | mov rdx, [rsp + 8*2]
+    case 2:
+        | mov rsi, [rsp + 8*1]
+    case 1:
+        | mov rdi, [rsp]
+    case 0:
+        break;
+    }
+
+    | add rsp, REGISTER_STACK_SPACE(ct)
+#else
+    |.else
+    if (ct->calling_convention == FAST_CALL) {
+        switch (reg.ints) {
+        case 2:
+            | mov edx, [rsp + 4]
+        case 1:
+            | mov ecx, [rsp]
+        case 0:
+            break;
+        }
+
+        | add rsp, REGISTER_STACK_SPACE(ct)
+    }
+    |.endif
+#endif
+
+#ifdef __amd64__
+    if (ct->has_var_arg) {
+        /* al stores an upper limit on the number of float registers; note that
+         * it's allowed to be more than the actual number of float registers used
+         * as long as it's 0-8 */
+        |.if X64 and not X64WIN
+        | mov al, 8
+        |.endif
+    }
+#endif
+
+    | call extern FUNCTION
+    | sub rsp, 48 // 32 to be able to call local functions, 16 so we can store some local variables
+
+    /* note on windows X86 the stack may be only aligned to 4 (stdcall will
+     * have popped a multiple of 4 bytes), but we don't need 16 byte alignment on
+     * that platform
+     */
+
+    lua_rawgeti(L, ct_usr, 0);
+    mbr_ct = (const struct ctype*) lua_touserdata(L, -1);
+
+    if (mbr_ct->pointers || mbr_ct->type == INTPTR_TYPE) {
+        lua_getuservalue(L, -1);
+        num_upvals += 2;
+        | mov [rsp+32], rax // save the pointer
+        | get_errno
+        | call_rrp extern push_cdata, L_ARG, lua_upvalueindex(num_upvals), mbr_ct
+        | mov rcx, [rsp+32]
+        | mov [rax], rcx // *(void**) cdata = val
+        | jmp ->lua_return_arg
+
+    } else {
+        switch (mbr_ct->type) {
+        case FUNCTION_PTR_TYPE:
+            lua_getuservalue(L, -1);
+            num_upvals += 2;
+            | mov [rsp+32], rax // save the function pointer
+            | get_errno
+            | call_rrp extern push_cdata, L_ARG, lua_upvalueindex(num_upvals), mbr_ct
+            | mov rcx, [rsp+32]
+            | mov [rax], rcx // *(cfunction**) cdata = val
+            | jmp ->lua_return_arg
+            break;
+
+        case INT64_TYPE:
+            num_upvals++;
+            | // save the return value
+            |.if X64
+            | mov [rsp+32], rax
+            |.else
+            | mov [rsp+36], edx // high
+            | mov [rsp+32], eax // low
+            |.endif
+            |
+            | get_errno
+            | call_rrp extern push_cdata, L_ARG, 0, mbr_ct
+            |
+            | // *(int64_t*) cdata = val
+            |.if X64
+            | mov rcx, [rsp+32]
+            | mov [rax], rcx
+            |.else
+            | mov rcx, [rsp+36]
+            | mov rdx, [rsp+32]
+            | mov [rax+4], rcx
+            | mov [rax], rdx
+            |.endif
+            |
+            | jmp ->lua_return_arg
+            break;
+
+        case COMPLEX_FLOAT_TYPE:
+            num_upvals++;
+            |.if X64
+            | // complex floats are returned as two floats packed into xmm0
+            | movq qword [rsp+32], xmm0
+            |.else
+            | // complex floats are returned as floats in eax and edx
+            | mov [rsp+32], eax
+            | mov [rsp+36], edx
+            |.endif
+            |
+            | get_errno
+            | call_rrp extern push_cdata, L_ARG, 0, mbr_ct
+            |
+            | // ((complex_float*) cdata) = val
+            |.if X64
+            | mov rcx, [rsp+32]
+            | mov [rax], rcx
+            |.else
+            | mov ecx, [rsp+32]
+            | mov [rax], ecx
+            | mov ecx, [rsp+36]
+            | mov [rax+4], ecx
+            |.endif
+            |
+            | jmp ->lua_return_arg
+            break;
+
+        case COMPLEX_DOUBLE_TYPE:
+            num_upvals++;
+            |.if X64
+            | // complex doubles are returned as xmm0 and xmm1
+            | movq qword [rsp+40], xmm1
+            | movq qword [rsp+32], xmm0
+            |
+            | get_errno
+            | call_rrp extern push_cdata, L_ARG, 0, mbr_ct
+            |
+            | // ((complex_double*) cdata)->real = val0
+            | // ((complex_double*) cdata)->imag = val1
+            | mov rcx, [rsp+40]
+            | mov [rax+8], rcx
+            | mov rcx, [rsp+32]
+            | mov [rax], rcx
+            |
+            |.else
+            | // On 32 bit we have already handled this by pushing a new cdata
+            | // and handing the cdata ptr in as the hidden first param, but
+            | // still need to add mbr_ct as an upval as its used earlier.
+            | // Hidden param was popped by called function, we need to realign.
+            | sub rsp, 4
+            | get_errno
+            |.endif
+            |
+            | jmp ->lua_return_arg
+            break;
+
+        case VOID_TYPE:
+            lua_pop(L, 1);
+            | jmp ->lua_return_void
+            break;
+
+        case BOOL_TYPE:
+            lua_pop(L, 1);
+            | jmp ->lua_return_bool
+            break;
+
+        case INT8_TYPE:
+            lua_pop(L, 1);
+            if (mbr_ct->is_unsigned) {
+                | movzx eax, al
+            } else {
+                | movsx eax, al
+            }
+            | jmp ->lua_return_int
+            break;
+
+        case INT16_TYPE:
+            lua_pop(L, 1);
+            if (mbr_ct->is_unsigned) {
+                | movzx eax, ax
+            } else {
+                | movsx eax, ax
+            }
+            | jmp ->lua_return_int
+            break;
+
+        case INT32_TYPE:
+        case ENUM_TYPE:
+            lua_pop(L, 1);
+            if (mbr_ct->is_unsigned) {
+                | jmp ->lua_return_uint
+            } else {
+                | jmp ->lua_return_int
+            }
+            break;
+
+        case FLOAT_TYPE:
+            lua_pop(L, 1);
+            |.if X64
+            | cvtss2sd xmm0, xmm0
+            |.endif
+            | jmp ->lua_return_double
+            break;
+
+        case DOUBLE_TYPE:
+            lua_pop(L, 1);
+            | jmp ->lua_return_double
+            break;
+
+        default:
+            luaL_error(L, "NYI: call return type");
+        }
+    }
+
+    assert(lua_gettop(L) == top + num_upvals);
+    {
+        cfunction f = compile(Dst, L, func, LUA_NOREF);
+        /* add a callback as an upval so that the jitted code gets cleaned up when
+         * the function gets gc'd */
+        push_callback(L, f);
+        lua_pushcclosure(L, (lua_CFunction) f, num_upvals+1);
+    }
+}
+
-- 
cgit v1.2.3