From 34a2e860efda4d585503202b6019f9deec1acf7d Mon Sep 17 00:00:00 2001 From: Alexander Lantsev Date: Tue, 16 Feb 2016 11:29:23 +0000 Subject: git-svn-id: http://svn.miranda-ng.org/main/trunk@16284 1316c22d-e87f-b044-9b9b-93d7a3e3ba9c --- plugins/MirLua/Modules/luaffi/src/call_x86.dasc | 1594 +++++++++++++++++++++++ 1 file changed, 1594 insertions(+) create mode 100644 plugins/MirLua/Modules/luaffi/src/call_x86.dasc (limited to 'plugins/MirLua/Modules/luaffi/src/call_x86.dasc') diff --git a/plugins/MirLua/Modules/luaffi/src/call_x86.dasc b/plugins/MirLua/Modules/luaffi/src/call_x86.dasc new file mode 100644 index 0000000000..fa8271a2a1 --- /dev/null +++ b/plugins/MirLua/Modules/luaffi/src/call_x86.dasc @@ -0,0 +1,1594 @@ +/* vim: ts=4 sw=4 sts=4 et tw=78 + * Copyright (c) 2011 James R. McKaskill. See license in ffi.h + */ +|.if X64 +|.arch x64 +|.else +|.arch x86 +|.endif + +|.actionlist build_actionlist +|.globalnames globnames +|.externnames extnames + +|.if not X64 +|.define RET_H, edx // for int64_t returns +|.define RET_L, eax +|.endif + +|.if X64WIN +| +|.macro call_rrrp, func, arg0, arg1, arg2, arg3 +| mov64 r9, arg3 +| mov r8, arg2 +| mov rdx, arg1 +| mov rcx, arg0 +| call func +|.endmacro +|.macro call_rrrr, func, arg0, arg1, arg2, arg3 +| mov r9, arg3 +| mov r8, arg2 +| mov rdx, arg1 +| mov rcx, arg0 +| call func +|.endmacro +| +|.macro call_rrp, func, arg0, arg1, arg2 +| mov64 r8, arg2 +| mov rdx, arg1 +| mov rcx, arg0 +| call func +|.endmacro +|.macro call_rrr, func, arg0, arg1, arg2 +| mov r8, arg2 +| mov rdx, arg1 +| mov rcx, arg0 +| call func +|.endmacro +| +|.macro call_rp, func, arg0, arg1 +| mov64 rdx, arg1 +| mov rcx, arg0 +| call func +|.endmacro +|.macro call_rr, func, arg0, arg1 +| mov rdx, arg1 +| mov rcx, arg0 +| call func +|.endmacro +| +|.macro call_r, func, arg0 +| mov rcx, arg0 +| call func +|.endmacro +| +|.elif X64 +| +| // the 5 and 6 arg forms are only used on posix x64 +|.macro call_rrrrrr, func, arg0, arg1, arg2, arg3, arg4, arg5 +| mov r9, arg5 +| mov r8, arg4 +| mov rcx, arg3 +| mov rdx, arg2 +| mov rsi, arg1 +| mov rdi, arg0 +| call func +|.endmacro +|.macro call_rrrrr, func, arg0, arg1, arg2, arg3, arg4 +| mov r8, arg4 +| mov rcx, arg3 +| mov rdx, arg2 +| mov rsi, arg1 +| mov rdi, arg0 +| call func +|.endmacro +| +|.macro call_rrrp, func, arg0, arg1, arg2, arg3 +| mov64 rcx, arg3 +| mov rdx, arg2 +| mov rsi, arg1 +| mov rdi, arg0 +| call func +|.endmacro +|.macro call_rrrr, func, arg0, arg1, arg2, arg3 +| mov rcx, arg3 +| mov rdx, arg2 +| mov rsi, arg1 +| mov rdi, arg0 +| call func +|.endmacro +| +|.macro call_rrp, func, arg0, arg1, arg2 +| mov64 rdx, arg2 +| mov rsi, arg1 +| mov rdi, arg0 +| call func +|.endmacro +|.macro call_rrr, func, arg0, arg1, arg2 +| mov rdx, arg2 +| mov rsi, arg1 +| mov rdi, arg0 +| call func +|.endmacro +| +|.macro call_rp, func, arg0, arg1 +| mov64 rsi, arg1 +| mov rdi, arg0 +| call func +|.endmacro +|.macro call_rr, func, arg0, arg1 +| mov rsi, arg1 +| mov rdi, arg0 +| call func +|.endmacro +| +|.macro call_r, func, arg0 +| mov rdi, arg0 +| call func +|.endmacro +| +|.else +| // define the 64bit registers to the 32 bit counterparts, so the common +| // code can use r*x for all pointers +|.define rax, eax +|.define rcx, ecx +|.define rdx, edx +|.define rsp, esp +|.define rbp, ebp +|.define rdi, edi +|.define rsi, esi +|.define mov64, mov +| +|.macro call_rrrr, func, arg0, arg1, arg2, arg3 +| mov dword [rsp+12], arg3 +| mov dword [rsp+8], arg2 +| mov dword [rsp+4], arg1 +| mov dword [rsp], arg0 +| call func +|.endmacro +|.macro call_rrr, func, arg0, arg1, arg2 +| mov dword [rsp+8], arg2 +| mov dword [rsp+4], arg1 +| mov dword [rsp], arg0 +| call func +|.endmacro +|.macro call_rr, func, arg0, arg1 +| mov dword [rsp+4], arg1 +| mov dword [rsp], arg0 +| call func +|.endmacro +|.macro call_r, func, arg0 +| mov dword [rsp], arg0 +| call func +|.endmacro +| +|.define call_rrrp, call_rrrr +|.define call_rrp, call_rrr +|.define call_rp, call_rr +| +|.endif + +#if defined _WIN64 || defined __amd64__ +#define JUMP_SIZE 14 +#else +#define JUMP_SIZE 4 +#endif + +#define MIN_BRANCH INT32_MIN +#define MAX_BRANCH INT32_MAX +#define BRANCH_OFF 4 + +static void compile_extern_jump(struct jit* jit, lua_State* L, cfunction func, uint8_t* code) +{ + /* The jump code is the function pointer followed by a stub to call the + * function pointer. The stub exists in 64 bit so we can jump to functions + * with an offset greater than 2 GB. + * + * Note we have to manually set this up since there are commands buffered + * in the jit state and dynasm doesn't support rip relative addressing. + * + * eg on 64 bit: + * 0-8: function ptr + * 8-14: jmp aword [rip-14] + * + * for 32 bit we only set the function ptr as it can always fit in a 32 + * bit displacement + */ +#if defined _WIN64 || defined __amd64__ + *(cfunction*) code = func; + code[8] = 0xFF; /* FF /4 operand for jmp */ + code[9] = 0x25; /* RIP displacement */ + *(int32_t*) &code[10] = -14; +#else + *(cfunction*) code = func; +#endif +} + +void compile_globals(struct jit* jit, lua_State* L) +{ + struct jit* Dst = jit; + int* perr = &jit->last_errno; + dasm_setup(Dst, build_actionlist); + + /* Note: since the return code uses EBP to reset the stack pointer, we + * don't have to track the amount of stack space used. It also means we + * can handle stdcall and cdecl with the same code. + */ + + /* Note the various call_* functions want 32 bytes of 16 byte aligned + * stack + */ + + |.if X64 + |.define L_ARG, r12 + |.define TOP, r13 + |.else + |.define L_ARG, rdi + |.define TOP, rsi + |.endif + + |.macro epilog + |.if X64 + | mov TOP, [rbp-16] + | mov L_ARG, [rbp-8] + |.else + | mov TOP, [rbp-8] + | mov L_ARG, [rbp-4] + |.endif + | mov rsp, rbp + | pop rbp + | ret + |.endmacro + + |.macro get_errno // note trashes registers + | call extern GetLastError + | mov64 rcx, perr + | mov dword [rcx], eax + |.endmacro + + /* the general idea for the return functions is: + * 1) Save return value on stack + * 2) Call get_errno (this trashes the registers hence #1) + * 3) Unpack return value from stack + * 4) Call lua push function + * 5) Set eax to number of returned args (0 or 1) + * 6) Call return which pops our stack frame + */ + + |->lua_return_arg: + | mov eax, 1 + | epilog + + |->lua_return_void: + | get_errno + | mov eax, 0 + | epilog + + |->lua_return_double: + |.if X64 + | movq qword [rsp+32], xmm0 + |.else + | fstp qword [rsp+4] // note get_errno doesn't require any stack on x86 + |.endif + | + | get_errno + | + |.if X64WIN + | movq xmm1, qword [rsp+32] + | mov rcx, L_ARG + |.elif X64 + | movq xmm0, qword [rsp+32] + | mov rdi, L_ARG + |.else + | mov [rsp], L_ARG + |.endif + | call extern lua_pushnumber + | jmp ->lua_return_arg + + |->lua_return_bool: + | movzx eax, al + | mov [rsp+32], eax + | get_errno + | mov eax, [rsp+32] + | call_rr extern lua_pushboolean, L_ARG, rax + | jmp ->lua_return_arg + + |->lua_return_int: + | mov [rsp+32], eax + | get_errno + | mov eax, [rsp+32] + | call_rr extern push_int, L_ARG, rax + | jmp ->lua_return_arg + + |->lua_return_uint: + | mov [rsp+32], eax + | get_errno + | mov eax, [rsp+32] + | call_rr extern push_uint, L_ARG, rax + | jmp ->lua_return_arg + + |->too_few_arguments: + | mov ax, 0 + | call_rp extern luaL_error, L_ARG, &"too few arguments" + + |->too_many_arguments: + | mov ax, 0 + | call_rp extern luaL_error, L_ARG, &"too many arguments" + + |->save_registers: + | // use rbp relative so we store values in the outer stack frame + |.if X64WIN + | // use the provided shadow space for int registers above prev rbp and + | // return address + | mov [rbp+16], rcx + | mov [rbp+24], rdx + | mov [rbp+32], r8 + | mov [rbp+40], r9 + | // use the extra space we added for float registers + | // -16 to store underneath previous value of L_ARG + | movq qword [rbp-16], xmm0 + | movq qword [rbp-24], xmm1 + | movq qword [rbp-32], xmm2 + | movq qword [rbp-40], xmm3 + |.elif X64 + | movq qword [rbp-16], xmm0 + | movq qword [rbp-24], xmm1 + | movq qword [rbp-32], xmm2 + | movq qword [rbp-40], xmm3 + | movq qword [rbp-48], xmm4 + | movq qword [rbp-56], xmm5 + | movq qword [rbp-64], xmm6 + | movq qword [rbp-72], xmm7 + | mov [rbp-80], rdi + | mov [rbp-88], rsi + | mov [rbp-96], rdx + | mov [rbp-104], rcx + | mov [rbp-112], r8 + | mov [rbp-120], r9 + |.else + | // fastcall, -8 to store underneath previous value of L_ARG + | mov [rbp-8], ecx + | mov [rbp-12], edx + |.endif + | ret + + compile(Dst, L, NULL, LUA_NOREF); +} + +int x86_return_size(lua_State* L, int usr, const struct ctype* ct) +{ + int ret = 0; + const struct ctype* mt; + + if (ct->calling_convention != C_CALL) { + size_t i; + size_t argn = lua_rawlen(L, usr); + for (i = 1; i <= argn; i++) { + lua_rawgeti(L, usr, (int) i); + mt = (const struct ctype*) lua_touserdata(L, -1); + + if (mt->pointers) { + ret += sizeof(void*); + } else { + switch (mt->type) { + case DOUBLE_TYPE: + case COMPLEX_FLOAT_TYPE: + case INT64_TYPE: + ret += 8; + break; + case COMPLEX_DOUBLE_TYPE: + ret += 16; + break; + case INTPTR_TYPE: + ret += sizeof(intptr_t); + break; + case FUNCTION_PTR_TYPE: + ret += sizeof(cfunction); + break; + case BOOL_TYPE: + case FLOAT_TYPE: + case INT8_TYPE: + case INT16_TYPE: + case INT32_TYPE: + case ENUM_TYPE: + ret += 4; + break; + default: + return luaL_error(L, "NYI - argument type"); + } + } + + lua_pop(L, 1); + } + } + +#if !defined _WIN64 && !defined __amd64__ + lua_rawgeti(L, usr, 0); + mt = (const struct ctype*) lua_touserdata(L, -1); + if (!mt->pointers && mt->type == COMPLEX_DOUBLE_TYPE) { + ret += sizeof(void*); + } + lua_pop(L, 1); +#endif + + return ret; +} + +#ifdef _WIN64 +#define MAX_REGISTERS(ct) 4 /* rcx, rdx, r8, r9 */ + +#elif defined __amd64__ +#define MAX_INT_REGISTERS(ct) 6 /* rdi, rsi, rdx, rcx, r8, r9 */ +#define MAX_FLOAT_REGISTERS(ct) 8 /* xmm0-7 */ + +#else +#define MAX_INT_REGISTERS(ct) ((ct)->calling_convention == FAST_CALL ? 2 /* ecx, edx */ : 0) +#define MAX_FLOAT_REGISTERS(ct) 0 +#endif + +struct reg_alloc { +#ifdef _WIN64 + int regs; + int is_float[4]; + int is_int[4]; +#else + int floats; + int ints; +#endif + int off; +}; + +#ifdef _WIN64 +#define REGISTER_STACK_SPACE(ct) (4*8) +#elif defined __amd64__ +#define REGISTER_STACK_SPACE(ct) (14*8) +#else +#define REGISTER_STACK_SPACE(ct) ALIGN_UP(((ct)->calling_convention == FAST_CALL ? 2*4 : 0), 15) +#endif + +/* Fastcall: + * Uses ecx, edx as first two int registers + * Everything else on stack (include 64bit ints) + * No overflow stack space + * Pops the stack before returning + * Returns int in eax, float in ST0 + * We use the same register allocation logic as posix x64 with 2 int regs and 0 float regs + */ + +static void get_int(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_int64) +{ + /* grab the register from the shadow space */ +#ifdef _WIN64 + if (reg->regs < MAX_REGISTERS(ct)) { + | mov rcx, [rbp + 16 + 8*reg->regs] + reg->regs++; + } +#elif __amd64__ + if (reg->ints < MAX_INT_REGISTERS(ct)) { + | mov rcx, [rbp - 80 - 8*reg->ints] + reg->ints++; + } +#else + if (!is_int64 && reg->ints < MAX_INT_REGISTERS(ct)) { + | mov ecx, [rbp - 8 - 4*reg->ints] + reg->ints++; + } +#endif + else if (is_int64) { + |.if X64 + | mov rcx, [rbp + reg->off] + |.else + | mov rcx, [rbp + reg->off] + | mov rdx, [rbp + reg->off + 4] + |.endif + reg->off += 8; + } else { + | mov ecx, [rbp + reg->off] + reg->off += 4; + } +} + +static void add_int(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_int64) +{ +#ifdef _WIN64 + if (reg->regs < MAX_REGISTERS(ct)) { + | mov [rsp + 32 + 8*(reg->regs)], rax + reg->is_int[reg->regs++] = 1; + } +#elif __amd64__ + if (reg->ints < MAX_INT_REGISTERS(ct)) { + | mov [rsp + 32 + 8*reg->ints], rax + reg->ints++; + } +#else + if (!is_int64 && reg->ints < MAX_INT_REGISTERS(ct)) { + | mov [rsp + 32 + 4*reg->ints], rax + reg->ints++; + } +#endif + + else if (is_int64) { + |.if X64 + | mov [rsp + reg->off], rax + |.else + | mov [rsp + reg->off], RET_L + | mov [rsp + reg->off + 4], RET_H + |.endif + reg->off += 8; + } else { + | mov [rsp+reg->off], eax + reg->off += 4; + } +} + +static void get_float(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_double) +{ +#if !defined _WIN64 && !defined __amd64__ + assert(MAX_FLOAT_REGISTERS(ct) == 0); + if (is_double) { + | fld qword [rbp + reg->off] + reg->off += 8; + } else { + | fld dword [rbp + reg->off] + reg->off += 4; + } +#else + int off; + +#ifdef _WIN64 + if (reg->regs < MAX_REGISTERS(ct)) { + off = -16 - 8*reg->regs; + reg->regs++; + } +#else + if (reg->floats < MAX_FLOAT_REGISTERS(ct)) { + off = -16 - 8*reg->floats; + reg->floats++; + } +#endif + else { + off = reg->off; + reg->off += is_double ? 8 : 4; + } + + if (is_double) { + | movq xmm0, qword [rbp + off] + } else { + | cvtss2sd xmm0, dword [rbp + off] + } +#endif +} + +static void add_float(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_double) +{ +#if !defined _WIN64 && !defined __amd64__ + assert(MAX_FLOAT_REGISTERS(ct) == 0); + if (is_double) { + | fstp qword [rsp + reg->off] + reg->off += 8; + } else { + | fstp dword [rsp + reg->off] + reg->off += 4; + } +#else + +#ifdef _WIN64 + if (reg->regs < MAX_REGISTERS(ct)) { + if (is_double) { + | movq qword [rsp + 32 + 8*(reg->regs)], xmm0 + } else { + | cvtsd2ss xmm0, xmm0 + | movq qword [rsp + 32 + 8*(reg->regs)], xmm0 + } + reg->is_float[reg->regs++] = 1; + } +#else + if (reg->floats < MAX_FLOAT_REGISTERS(ct)) { + if (is_double) { + | movq qword [rsp + 32 + 8*(MAX_INT_REGISTERS(ct) + reg->floats)], xmm0 + } else { + | cvtsd2ss xmm0, xmm0 + | movq qword [rsp + 32 + 8*(MAX_INT_REGISTERS(ct) + reg->floats)], xmm0 + } + reg->floats++; + } +#endif + + else if (is_double) { + | movq qword [rsp + reg->off], xmm0 + reg->off += 8; + } else { + | cvtsd2ss xmm0, xmm0 + | movd dword [rsp + reg->off], xmm0 + reg->off += 4; + } +#endif +} + +#if defined _WIN64 || defined __amd64__ +#define add_pointer(jit, ct, reg) add_int(jit, ct, reg, 1) +#define get_pointer(jit, ct, reg) get_int(jit, ct, reg, 1) +#else +#define add_pointer(jit, ct, reg) add_int(jit, ct, reg, 0) +#define get_pointer(jit, ct, reg) get_int(jit, ct, reg, 0) +#endif + +cfunction compile_callback(lua_State* L, int fidx, int ct_usr, const struct ctype* ct) +{ + int i, nargs; + cfunction* pf; + struct ctype ct2 = *ct; + const struct ctype* mt; + struct reg_alloc reg; + int num_upvals = 0; + int top = lua_gettop(L); + struct jit* Dst = get_jit(L); + int ref; + int hidden_arg_off = 0; + + ct_usr = lua_absindex(L, ct_usr); + fidx = lua_absindex(L, fidx); + + assert(lua_isnil(L, fidx) || lua_isfunction(L, fidx)); + + memset(®, 0, sizeof(reg)); +#ifdef _WIN64 + reg.off = 16 + REGISTER_STACK_SPACE(ct); /* stack registers are above the shadow space */ +#elif __amd64__ + reg.off = 16; +#else + reg.off = 8; +#endif + + dasm_setup(Dst, build_actionlist); + + // add a table to store ctype and function upvalues + // callback_set assumes the first value is the lua function + nargs = (int) lua_rawlen(L, ct_usr); + lua_newtable(L); + lua_pushvalue(L, -1); + ref = luaL_ref(L, LUA_REGISTRYINDEX); + + if (ct->has_var_arg) { + luaL_error(L, "can't create callbacks with varargs"); + } + + // setup a stack frame to hold args for the call into lua_call + + | push rbp + | mov rbp, rsp + | push L_ARG + | // stack is 4 or 8 (mod 16) (L_ARG, rbp, rip) + |.if X64 + | // 8 to realign, 16 for return vars, 32 for local calls, rest to save registers + | sub rsp, 8 + 16 + 32 + REGISTER_STACK_SPACE(ct) + | call ->save_registers + |.else + | // 4 to realign, 16 for return vars, 32 for local calls, rest to save registers + | sub rsp, 4 + 16 + 32 + REGISTER_STACK_SPACE(ct) + if (ct->calling_convention == FAST_CALL) { + | call ->save_registers + } + |.endif + + // hardcode the lua_State* value into the assembly + | mov64 L_ARG, L + + /* get the upval table */ + | call_rrr extern lua_rawgeti, L_ARG, LUA_REGISTRYINDEX, ref + + /* get the lua function */ + lua_pushvalue(L, fidx); + lua_rawseti(L, -2, ++num_upvals); + assert(num_upvals == CALLBACK_FUNC_USR_IDX); + | call_rrr extern lua_rawgeti, L_ARG, -1, num_upvals + +#if !defined _WIN64 && !defined __amd64__ + lua_rawgeti(L, ct_usr, 0); + mt = (const struct ctype*) lua_touserdata(L, -1); + if (!mt->pointers && mt->type == COMPLEX_DOUBLE_TYPE) { + hidden_arg_off = reg.off; + reg.off += sizeof(void*); + } + lua_pop(L, 1); +#else + (void) hidden_arg_off; +#endif + + for (i = 1; i <= nargs; i++) { + lua_rawgeti(L, ct_usr, i); + mt = (const struct ctype*) lua_touserdata(L, -1); + + if (mt->pointers) { + lua_getuservalue(L, -1); + lua_rawseti(L, -3, ++num_upvals); /* usr value */ + lua_rawseti(L, -2, ++num_upvals); /* mt */ + /* on the lua stack in the callback: + * upval tbl, lua func, i-1 args + */ + | call_rrr extern lua_rawgeti, L_ARG, -i-1, num_upvals-1 + | call_rrp extern push_cdata, L_ARG, -1, mt + get_pointer(Dst, ct, ®); + | mov [rax], rcx + | call_rr, extern lua_remove, L_ARG, -2 + } else { + switch (mt->type) { + case INT64_TYPE: + lua_getuservalue(L, -1); + lua_rawseti(L, -3, ++num_upvals); /* mt */ + lua_pop(L, 1); + | call_rrp extern push_cdata, L_ARG, 0, mt + get_int(Dst, ct, ®, 1); + |.if X64 + | mov [rax], rcx + |.else + | mov [rax], ecx + | mov [rax+4], edx + |.endif + break; + + case INTPTR_TYPE: + lua_getuservalue(L, -1); + lua_rawseti(L, -3, ++num_upvals); /* mt */ + lua_pop(L, 1); + | call_rrp extern push_cdata, L_ARG, 0, mt + get_pointer(Dst, ct, ®); + | mov [rax], rcx + break; + + case COMPLEX_FLOAT_TYPE: + lua_pop(L, 1); +#if defined _WIN64 || defined __amd64__ + /* complex floats are two floats packed into a double */ + | call_rrp extern push_cdata, L_ARG, 0, mt + get_float(Dst, ct, ®, 1); + | movq qword [rax], xmm0 +#else + /* complex floats are real followed by imag on the stack */ + | call_rrp extern push_cdata, L_ARG, 0, mt + get_float(Dst, ct, ®, 0); + | fstp dword [rax] + get_float(Dst, ct, ®, 0); + | fstp dword [rax+4] +#endif + break; + + case COMPLEX_DOUBLE_TYPE: + lua_pop(L, 1); + | call_rrp extern push_cdata, L_ARG, 0, mt + /* real */ + get_float(Dst, ct, ®, 1); + |.if X64 + | movq qword [rax], xmm0 + |.else + | fstp qword [rax] + |.endif + /* imag */ + get_float(Dst, ct, ®, 1); + |.if X64 + | movq qword [rax+8], xmm0 + |.else + | fstp qword [rax+8] + |.endif + break; + + case FLOAT_TYPE: + case DOUBLE_TYPE: + lua_pop(L, 1); + get_float(Dst, ct, ®, mt->type == DOUBLE_TYPE); + |.if X64WIN + | movq xmm1, xmm0 + | mov rcx, L_ARG + |.elif X64 + | // for 64bit xmm0 is already set + | mov rdi, L_ARG + |.else + | fstp qword [rsp+4] + | mov [rsp], L_ARG + |.endif + | call extern lua_pushnumber + break; + + case BOOL_TYPE: + lua_pop(L, 1); + get_int(Dst, ct, ®, 0); + | movzx ecx, cl + | call_rr extern lua_pushboolean, L_ARG, rcx + break; + + case INT8_TYPE: + lua_pop(L, 1); + get_int(Dst, ct, ®, 0); + if (mt->is_unsigned) { + | movzx ecx, cl + } else { + | movsx ecx, cl + } + | call_rr extern push_int, L_ARG, rcx + break; + + case INT16_TYPE: + lua_pop(L, 1); + get_int(Dst, ct, ®, 0); + if (mt->is_unsigned) { + | movzx ecx, cx + } else { + | movsx ecx, cx + } + | call_rr extern push_int, L_ARG, rcx + break; + + case ENUM_TYPE: + case INT32_TYPE: + lua_pop(L, 1); + get_int(Dst, ct, ®, 0); + if (mt->is_unsigned) { + | call_rr extern push_uint, L_ARG, rcx + } else { + | call_rr extern push_int, L_ARG, rcx + } + break; + + default: + luaL_error(L, "NYI: callback arg type"); + } + } + } + + lua_rawgeti(L, ct_usr, 0); + mt = (const struct ctype*) lua_touserdata(L, -1); + + | call_rrrp extern lua_callk, L_ARG, nargs, (mt->pointers || mt->type != VOID_TYPE) ? 1 : 0, 0 + + // Unpack the return argument if not "void", also clean-up the lua stack + // to remove the return argument and bind table. Use lua_settop rather + // than lua_pop as lua_pop is implemented as a macro. + if (mt->pointers) { + lua_getuservalue(L, -1); + lua_rawseti(L, -3, ++num_upvals); /* usr value */ + lua_rawseti(L, -2, ++num_upvals); /* mt */ + | call_rrr extern lua_rawgeti, L_ARG, -2, num_upvals-1 + | call_rrrp extern check_typed_pointer, L_ARG, -2, -1, mt + | mov [rsp+32], rax + | call_rr extern lua_settop, L_ARG, -4 + | mov rax, [rsp+32] + + } else { + switch (mt->type) { + case ENUM_TYPE: + lua_getuservalue(L, -1); + lua_rawseti(L, -3, ++num_upvals); /* usr value */ + lua_rawseti(L, -2, ++num_upvals); /* mt */ + | call_rrr extern lua_rawgeti, L_ARG, -2, num_upvals-1 + | call_rrrp, extern check_enum, L_ARG, -2, -1, mt + | mov [rsp+32], eax + | call_rr extern lua_settop, L_ARG, -4 + | mov eax, [rsp+32] + break; + + case VOID_TYPE: + lua_pop(L, 1); + | call_rr extern lua_settop, L_ARG, -2 + break; + + case BOOL_TYPE: + case INT8_TYPE: + case INT16_TYPE: + case INT32_TYPE: + lua_pop(L, 1); + if (mt->is_unsigned) { + | call_rr extern check_uint32, L_ARG, -1 + } else { + | call_rr extern check_int32, L_ARG, -1 + } + | mov [rsp+32], eax + | call_rr extern lua_settop, L_ARG, -3 + | mov eax, [rsp+32] + break; + + case INT64_TYPE: + lua_pop(L, 1); + + if (mt->is_unsigned) { + | call_rr extern check_uint64, L_ARG, -1 + } else { + | call_rr extern check_int64, L_ARG, -1 + } + + |.if X64 + | mov [rsp+32], rax + |.else + | mov [rsp+32], RET_L + | mov [rsp+36], RET_H + |.endif + | call_rr extern lua_settop, L_ARG, -3 + |.if X64 + | mov rax, [rsp+32] + |.else + | mov RET_L, [rsp+32] + | mov RET_H, [rsp+36] + |.endif + break; + + case INTPTR_TYPE: + lua_pop(L, 1); + | call_rr extern check_uintptr, L_ARG, -1 + | mov [rsp+32], rax + | call_rr extern lua_settop, L_ARG, -3 + | mov rax, [rsp+32] + break; + + case FLOAT_TYPE: + case DOUBLE_TYPE: + lua_pop(L, 1); + | call_rr extern check_double, L_ARG, -1 + |.if X64 + | movq qword [rsp+32], xmm0 + | call_rr extern lua_settop, L_ARG, -3 + if (mt->type == FLOAT_TYPE) { + | cvtsd2ss xmm0, qword [rsp+32] + } else { + | movq xmm0, qword [rsp+32] + } + |.else + | fstp qword [rsp+32] + | call_rr extern lua_settop, L_ARG, -3 + | fld qword [rsp+32] + |.endif + break; + + case COMPLEX_FLOAT_TYPE: + lua_pop(L, 1); +#if !defined HAVE_COMPLEX + luaL_error(L, "ffi lib compiled without complex number support"); +#endif + /* on 64 bit complex floats are two floats packed into a double, + * on 32 bit returned complex floats use eax and edx */ + | call_rr extern check_complex_float, L_ARG, -1 + | + |.if X64 + | movq qword [rsp+32], xmm0 + |.else + | mov [rsp+32], eax + | mov [rsp+36], edx + |.endif + | + | call_rr extern lua_settop, L_ARG, -3 + | + |.if X64 + | movq xmm0, qword [rsp+32] + |.else + | mov eax, [rsp+32] + | mov edx, [rsp+36] + |.endif + break; + + case COMPLEX_DOUBLE_TYPE: + lua_pop(L, 1); +#if !defined HAVE_COMPLEX + luaL_error(L, "ffi lib compiled without complex number support"); +#endif + /* on 64 bit, returned complex doubles use xmm0, xmm1, on 32 bit + * there is a hidden first parameter that points to 16 bytes where + * the returned arg is stored which is popped by the called + * function */ +#if defined _WIN64 || defined __amd64__ + | call_rr extern check_complex_double, L_ARG, -1 + | movq qword [rsp+32], xmm0 + | movq qword [rsp+40], xmm1 + | call_rr extern lua_settop, L_ARG, -3 + | movq xmm0, qword [rsp+32] + | movq xmm1, qword [rsp+40] +#else + | mov rcx, [rbp + hidden_arg_off] + | call_rrr extern check_complex_double, rcx, L_ARG, -1 + | sub rsp, 4 // to realign from popped hidden arg + | call_rr extern lua_settop, L_ARG, -3 +#endif + break; + + default: + luaL_error(L, "NYI: callback return type"); + } + } + + |.if X64 + | mov L_ARG, [rbp-8] + |.else + | mov L_ARG, [rbp-4] + |.endif + | mov rsp, rbp + | pop rbp + | ret x86_return_size(L, ct_usr, ct) + + lua_pop(L, 1); /* upval table - already in registry */ + assert(lua_gettop(L) == top); + + ct2.is_jitted = 1; + pf = (cfunction*) push_cdata(L, ct_usr, &ct2); + *pf = compile(Dst, L, NULL, ref); + + assert(lua_gettop(L) == top + 1); + + return *pf; +} + +void compile_function(lua_State* L, cfunction func, int ct_usr, const struct ctype* ct) +{ + size_t i, nargs; + int num_upvals; + const struct ctype* mbr_ct; + struct jit* Dst = get_jit(L); + struct reg_alloc reg; + void* p; + int top = lua_gettop(L); + int* perr = &Dst->last_errno; + + ct_usr = lua_absindex(L, ct_usr); + + memset(®, 0, sizeof(reg)); + reg.off = 32 + REGISTER_STACK_SPACE(ct); + + dasm_setup(Dst, build_actionlist); + + p = push_cdata(L, ct_usr, ct); + *(cfunction*) p = func; + num_upvals = 1; + + nargs = lua_rawlen(L, ct_usr); + + if (ct->calling_convention != C_CALL && ct->has_var_arg) { + luaL_error(L, "vararg is only allowed with the c calling convention"); + } + + | push rbp + | mov rbp, rsp + | push L_ARG + | push TOP + | // stack is 0 (mod 16) (TOP, L_ARG, rbp, rip) + | + | // Get L from our arguments and allocate some stack for lua_gettop + |.if X64WIN + | mov L_ARG, rcx + | sub rsp, 32 // shadow space + |.elif X64 + | mov L_ARG, rdi + |.else + | mov L_ARG, [rbp + 8] + | sub rsp, 16 + |.endif + | + | call_r extern lua_gettop, L_ARG + | mov TOP, rax // no need for movzxd rax, eax - high word guarenteed to be zero by x86-64 + | cmp rax, nargs + | jl ->too_few_arguments + + if (!ct->has_var_arg) { + | jg ->too_many_arguments + } + + /* no need to zero extend eax returned by lua_gettop to rax as x86-64 + * preguarentees that the upper 32 bits will be zero */ + | shl rax, 4 // reserve 16 bytes per argument - this maintains the alignment mod 16 + | sub rsp, rax + | sub rsp, 32 + REGISTER_STACK_SPACE(ct) // reserve an extra 32 to call local functions + +#if !defined _WIN64 && !defined __amd64__ + /* Returned complex doubles require a hidden first parameter where the + * data is stored, which is popped by the calling code. */ + lua_rawgeti(L, ct_usr, 0); + mbr_ct = (const struct ctype*) lua_touserdata(L, -1); + if (!mbr_ct->pointers && mbr_ct->type == COMPLEX_DOUBLE_TYPE) { + /* we can allocate more space for arguments as long as no add_* + * function has been called yet, mbr_ct will be added as an upvalue in + * the return processing later */ + | call_rrp extern push_cdata, L_ARG, 0, mbr_ct + | sub rsp, 16 + add_pointer(Dst, ct, ®); + } + lua_pop(L, 1); +#endif + + for (i = 1; i <= nargs; i++) { + lua_rawgeti(L, ct_usr, (int) i); + mbr_ct = (const struct ctype*) lua_touserdata(L, -1); + + if (mbr_ct->pointers) { + lua_getuservalue(L, -1); + num_upvals += 2; + | call_rrrp extern check_typed_pointer, L_ARG, i, lua_upvalueindex(num_upvals), mbr_ct + add_pointer(Dst, ct, ®); + } else { + switch (mbr_ct->type) { + case FUNCTION_PTR_TYPE: + lua_getuservalue(L, -1); + num_upvals += 2; + | call_rrrp extern check_typed_cfunction, L_ARG, i, lua_upvalueindex(num_upvals), mbr_ct + add_pointer(Dst, ct, ®); + break; + + case ENUM_TYPE: + lua_getuservalue(L, -1); + num_upvals += 2; + | call_rrrp, extern check_enum, L_ARG, i, lua_upvalueindex(num_upvals), mbr_ct + add_int(Dst, ct, ®, 0); + break; + + case INT8_TYPE: + | call_rr extern check_int32, L_ARG, i + if (mbr_ct->is_unsigned) { + | movzx eax, al + } else { + | movsx eax, al + } + add_int(Dst, ct, ®, 0); + lua_pop(L, 1); + break; + + case INT16_TYPE: + | call_rr extern check_int32, L_ARG, i + if (mbr_ct->is_unsigned) { + | movzx eax, ax + } else { + | movsx eax, ax + } + add_int(Dst, ct, ®, 0); + lua_pop(L, 1); + break; + + case BOOL_TYPE: + | call_rr extern check_int32, L_ARG, i + | cmp eax, 0 + | setne al + | movzx eax, al + add_int(Dst, ct, ®, 0); + lua_pop(L, 1); + break; + + case INT32_TYPE: + if (mbr_ct->is_unsigned) { + | call_rr extern check_uint32, L_ARG, i + } else { + | call_rr extern check_int32, L_ARG, i + } + add_int(Dst, ct, ®, 0); + lua_pop(L, 1); + break; + + case INTPTR_TYPE: + | call_rr extern check_uintptr, L_ARG, i + add_pointer(Dst, ct, ®); + lua_pop(L, 1); + break; + + case INT64_TYPE: + if (mbr_ct->is_unsigned) { + | call_rr extern check_uint64, L_ARG, i + } else { + | call_rr extern check_int64, L_ARG, i + } + add_int(Dst, ct, ®, 1); + lua_pop(L, 1); + break; + + case DOUBLE_TYPE: + | call_rr extern check_double, L_ARG, i + add_float(Dst, ct, ®, 1); + lua_pop(L, 1); + break; + + case COMPLEX_DOUBLE_TYPE: + /* on 64 bit, returned complex doubles use xmm0, xmm1, on 32 bit + * there is a hidden first parameter that points to 16 bytes where + * the returned arg is stored (this is popped by the called + * function) */ +#if defined _WIN64 || defined __amd64__ + | call_rr extern check_complex_double, L_ARG, i + add_float(Dst, ct, ®, 1); + | movq xmm0, xmm1 + add_float(Dst, ct, ®, 1); +#else + | lea rax, [rsp+reg.off] + | sub rsp, 4 + | call_rrr extern check_complex_double, rax, L_ARG, i + reg.off += 16; +#endif + lua_pop(L, 1); + break; + + case FLOAT_TYPE: + | call_rr extern check_double, L_ARG, i + add_float(Dst, ct, ®, 0); + lua_pop(L, 1); + break; + + case COMPLEX_FLOAT_TYPE: +#if defined _WIN64 || defined __amd64__ + | call_rr extern check_complex_float, L_ARG, i + /* complex floats are two floats packed into a double */ + add_float(Dst, ct, ®, 1); +#else + /* returned complex floats use eax and edx */ + | call_rr extern check_complex_float, L_ARG, i + | mov [rsp], eax + | fld dword [rsp] + add_float(Dst, ct, ®, 0); + | mov [rsp], edx + | fld dword [rsp] + add_float(Dst, ct, ®, 0); +#endif + lua_pop(L, 1); + break; + + default: + luaL_error(L, "NYI: call arg type"); + } + } + } + + if (ct->has_var_arg) { +#ifdef _WIN64 + |.if X64WIN + if (reg.regs < MAX_REGISTERS(ct)) { + assert(reg.regs == nargs); + | cmp TOP, MAX_REGISTERS(ct) + | jle >1 + | // unpack onto stack + | mov rax, rsp + | add rax, 32 + 8*MAX_REGISTERS(ct) + | call_rrrr extern unpack_varargs_stack, L_ARG, MAX_REGISTERS(ct)+1, TOP, rax + | // unpack to registers + | mov rax, rsp + | add rax, 32 + 8*(reg.regs) + | call_rrrr extern unpack_varargs_reg, L_ARG, nargs+1, MAX_REGISTERS(ct), rax + | jmp >2 + |1: + | // unpack just to registers + | mov rax, rsp + | add rax, 32 + 8*(reg.regs) + | call_rrrr extern unpack_varargs_reg, L_ARG, nargs+1, TOP, rax + |2: + } else { + | // unpack just to stack + | mov rax, rsp + | add rax, reg.off + | call_rrrr extern unpack_varargs_stack, L_ARG, nargs+1, TOP, rax + } + + for (i = nargs; i < MAX_REGISTERS(ct); i++) { + reg.is_int[i] = reg.is_float[i] = 1; + } + reg.regs = MAX_REGISTERS(ct); +#elif defined __amd64__ + |.elif X64 + if (reg.floats < MAX_FLOAT_REGISTERS(ct)) { + | mov rax, rsp + | add rax, 32 + 8*(MAX_INT_REGISTERS(ct) + reg.floats) + | call_rrrrr extern unpack_varargs_float, L_ARG, nargs+1, TOP, MAX_FLOAT_REGISTERS(ct) - reg.floats, rax + } + + if (reg.ints < MAX_INT_REGISTERS(ct)) { + | mov rax, rsp + | add rax, 32 + 8*(reg.ints) + | call_rrrrr extern unpack_varargs_int, L_ARG, nargs+1, TOP, MAX_INT_REGISTERS(ct) - reg.ints, rax + } + + | mov rax, rsp + | add rax, reg.off + | call_rrrrrr extern unpack_varargs_stack_skip, L_ARG, nargs+1, TOP, MAX_INT_REGISTERS(ct) - reg.ints, MAX_FLOAT_REGISTERS(ct) - reg.floats, rax + + reg.floats = MAX_FLOAT_REGISTERS(ct); + reg.ints = MAX_INT_REGISTERS(ct); +#else + |.else + | mov rax, rsp + | add rax, reg.off + | call_rrrr extern unpack_varargs_stack, L_ARG, nargs+1, TOP, rax + |.endif +#endif + } + + | mov64 rcx, perr + | mov eax, dword [rcx] + | call_r extern SetLastError, rax + + /* remove the stack space to call local functions */ + |.if X32WIN + | add rsp, 28 // SetLastError will have already popped 4 + |.else + | add rsp, 32 + |.endif + +#ifdef _WIN64 + |.if X64WIN + switch (reg.regs) { + case 4: + if (reg.is_float[3]) { + | movq xmm3, qword [rsp + 8*3] + } + if (reg.is_int[3]) { + | mov r9, [rsp + 8*3] + } + case 3: + if (reg.is_float[2]) { + | movq xmm2, qword [rsp + 8*2] + } + if (reg.is_int[2]) { + | mov r8, [rsp + 8*2] + } + case 2: + if (reg.is_float[1]) { + | movq xmm1, qword [rsp + 8*1] + } + if (reg.is_int[1]) { + | mov rdx, [rsp + 8*1] + } + case 1: + if (reg.is_float[0]) { + | movq xmm0, qword [rsp] + } + if (reg.is_int[0]) { + | mov rcx, [rsp] + } + case 0: + break; + } + + /* don't remove the space for the registers as we need 32 bytes of register overflow space */ + assert(REGISTER_STACK_SPACE(ct) == 32); + +#elif defined __amd64__ + |.elif X64 + switch (reg.floats) { + case 8: + | movq xmm7, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+7)] + case 7: + | movq xmm6, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+6)] + case 6: + | movq xmm5, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+5)] + case 5: + | movq xmm4, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+4)] + case 4: + | movq xmm3, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+3)] + case 3: + | movq xmm2, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+2)] + case 2: + | movq xmm1, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+1)] + case 1: + | movq xmm0, qword [rsp + 8*(MAX_INT_REGISTERS(ct))] + case 0: + break; + } + + switch (reg.ints) { + case 6: + | mov r9, [rsp + 8*5] + case 5: + | mov r8, [rsp + 8*4] + case 4: + | mov rcx, [rsp + 8*3] + case 3: + | mov rdx, [rsp + 8*2] + case 2: + | mov rsi, [rsp + 8*1] + case 1: + | mov rdi, [rsp] + case 0: + break; + } + + | add rsp, REGISTER_STACK_SPACE(ct) +#else + |.else + if (ct->calling_convention == FAST_CALL) { + switch (reg.ints) { + case 2: + | mov edx, [rsp + 4] + case 1: + | mov ecx, [rsp] + case 0: + break; + } + + | add rsp, REGISTER_STACK_SPACE(ct) + } + |.endif +#endif + +#ifdef __amd64__ + if (ct->has_var_arg) { + /* al stores an upper limit on the number of float register, note that + * its allowed to be more than the actual number of float registers used as + * long as its 0-8 */ + |.if X64 and not X64WIN + | mov al, 8 + |.endif + } +#endif + + | call extern FUNCTION + | sub rsp, 48 // 32 to be able to call local functions, 16 so we can store some local variables + + /* note on windows X86 the stack may be only aligned to 4 (stdcall will + * have popped a multiple of 4 bytes), but we don't need 16 byte alignment on + * that platform + */ + + lua_rawgeti(L, ct_usr, 0); + mbr_ct = (const struct ctype*) lua_touserdata(L, -1); + + if (mbr_ct->pointers || mbr_ct->type == INTPTR_TYPE) { + lua_getuservalue(L, -1); + num_upvals += 2; + | mov [rsp+32], rax // save the pointer + | get_errno + | call_rrp extern push_cdata, L_ARG, lua_upvalueindex(num_upvals), mbr_ct + | mov rcx, [rsp+32] + | mov [rax], rcx // *(void**) cdata = val + | jmp ->lua_return_arg + + } else { + switch (mbr_ct->type) { + case FUNCTION_PTR_TYPE: + lua_getuservalue(L, -1); + num_upvals += 2; + | mov [rsp+32], rax // save the function pointer + | get_errno + | call_rrp extern push_cdata, L_ARG, lua_upvalueindex(num_upvals), mbr_ct + | mov rcx, [rsp+32] + | mov [rax], rcx // *(cfunction**) cdata = val + | jmp ->lua_return_arg + break; + + case INT64_TYPE: + num_upvals++; + | // save the return value + |.if X64 + | mov [rsp+32], rax + |.else + | mov [rsp+36], edx // high + | mov [rsp+32], eax // low + |.endif + | + | get_errno + | call_rrp extern push_cdata, L_ARG, 0, mbr_ct + | + | // *(int64_t*) cdata = val + |.if X64 + | mov rcx, [rsp+32] + | mov [rax], rcx + |.else + | mov rcx, [rsp+36] + | mov rdx, [rsp+32] + | mov [rax+4], rcx + | mov [rax], rdx + |.endif + | + | jmp ->lua_return_arg + break; + + case COMPLEX_FLOAT_TYPE: + num_upvals++; + |.if X64 + | // complex floats are returned as two floats packed into xmm0 + | movq qword [rsp+32], xmm0 + |.else + | // complex floats are returned as floats in eax and edx + | mov [rsp+32], eax + | mov [rsp+36], edx + |.endif + | + | get_errno + | call_rrp extern push_cdata, L_ARG, 0, mbr_ct + | + | // ((complex_float*) cdata) = val + |.if X64 + | mov rcx, [rsp+32] + | mov [rax], rcx + |.else + | mov ecx, [rsp+32] + | mov [rax], ecx + | mov ecx, [rsp+36] + | mov [rax+4], ecx + |.endif + | + | jmp ->lua_return_arg + break; + + case COMPLEX_DOUBLE_TYPE: + num_upvals++; + |.if X64 + | // complex doubles are returned as xmm0 and xmm1 + | movq qword [rsp+40], xmm1 + | movq qword [rsp+32], xmm0 + | + | get_errno + | call_rrp extern push_cdata, L_ARG, 0, mbr_ct + | + | // ((complex_double*) cdata)->real = val0 + | // ((complex_double*) cdata)->imag = val1 + | mov rcx, [rsp+40] + | mov [rax+8], rcx + | mov rcx, [rsp+32] + | mov [rax], rcx + | + |.else + | // On 32 bit we have already handled this by pushing a new cdata + | // and handing the cdata ptr in as the hidden first param, but + | // still need to add mbr_ct as an upval as its used earlier. + | // Hidden param was popped by called function, we need to realign. + | sub rsp, 4 + | get_errno + |.endif + | + | jmp ->lua_return_arg + break; + + case VOID_TYPE: + lua_pop(L, 1); + | jmp ->lua_return_void + break; + + case BOOL_TYPE: + lua_pop(L, 1); + | jmp ->lua_return_bool + break; + + case INT8_TYPE: + lua_pop(L, 1); + if (mbr_ct->is_unsigned) { + | movzx eax, al + } else { + | movsx eax, al + } + | jmp ->lua_return_int + break; + + case INT16_TYPE: + lua_pop(L, 1); + if (mbr_ct->is_unsigned) { + | movzx eax, ax + } else { + | movsx eax, ax + } + | jmp ->lua_return_int + break; + + case INT32_TYPE: + case ENUM_TYPE: + lua_pop(L, 1); + if (mbr_ct->is_unsigned) { + | jmp ->lua_return_uint + } else { + | jmp ->lua_return_int + } + break; + + case FLOAT_TYPE: + lua_pop(L, 1); + |.if X64 + | cvtss2sd xmm0, xmm0 + |.endif + | jmp ->lua_return_double + break; + + case DOUBLE_TYPE: + lua_pop(L, 1); + | jmp ->lua_return_double + break; + + default: + luaL_error(L, "NYI: call return type"); + } + } + + assert(lua_gettop(L) == top + num_upvals); + { + cfunction f = compile(Dst, L, func, LUA_NOREF); + /* add a callback as an upval so that the jitted code gets cleaned up when + * the function gets gc'd */ + push_callback(L, f); + lua_pushcclosure(L, (lua_CFunction) f, num_upvals+1); + } +} + -- cgit v1.2.3