20120903 - Linux Threading Via Syscalls


This post is for reference, so sorry in advance: the code is written mostly in my personal style and is taken from another personal project. It is mostly type-loose (I often interchange between pointers and the register-sized integer type UP) and is mixed with lots of macros.

Syscalls Without Headers
First, x86-64 Linux system calls in C with inline assembly. There are seven functions, one for each argument count from zero to six. The first parameter is always the system call number. For binary portability these system call numbers stay constant, and can be found in asm-x86/unistd_64.h under __NR_*. Keep in mind that some of the glibc interfaces to the system calls don't always match the kernel interfaces. Note the ABI has a slightly different calling convention between userspace and syscalls (for example, the fourth argument is passed in r10); also, "register" attributes are needed when inline assembly lacks certain constraints.

// __I__ marks a file-local helper that is always inlined into its caller.
// The `inline` keyword is included because GCC documents always_inline as
// applying to functions declared inline, and warns when it is missing.
#define __I__ static inline __attribute__((always_inline))

// UP: unsigned integer of pointer/register size. SP: its signed counterpart.
typedef unsigned long UP;
typedef signed long SP;

// Issue a system call that takes no arguments.
//   num : system call number, passed in rax
// Returns the raw value left in rax by the kernel.
// Uses __asm__/__volatile__ rather than the plain `asm` keyword so the code
// also builds in strict ISO modes (-std=c99/-std=c11), where `asm` is not a keyword.
__I__ SP Linux0(UP num)
{
    SP ret;
    __asm__ __volatile__ (
        "syscall"
        : "=a" (ret)
        : "a" (num)
        : "cc", "memory",
          "%rcx", "%rdx", "%rdi", "%rsi", "%r8", "%r9", "%r10", "%r11"
    );
    return ret;
}

// Issue a system call that takes one argument.
//   num  : system call number, passed in rax
//   arg1 : first argument, passed in rdi
// Returns the raw value left in rax by the kernel.
// Uses __asm__/__volatile__ rather than the plain `asm` keyword so the code
// also builds in strict ISO modes (-std=c99/-std=c11), where `asm` is not a keyword.
__I__ SP Linux1(UP num, UP arg1)
{
    SP ret;
    __asm__ __volatile__ (
        "syscall"
        : "=a" (ret)
        : "a" (num), "D" (arg1)
        : "cc", "memory",
          "%rcx", "%rdx", "%rsi", "%r8", "%r9", "%r10", "%r11"
    );
    return ret;
}

// Issue a system call that takes two arguments.
//   num  : system call number, passed in rax
//   arg1 : first argument, passed in rdi
//   arg2 : second argument, passed in rsi
// Returns the raw value left in rax by the kernel.
// Uses __asm__/__volatile__ rather than the plain `asm` keyword so the code
// also builds in strict ISO modes (-std=c99/-std=c11), where `asm` is not a keyword.
__I__ SP Linux2(UP num, UP arg1, UP arg2)
{
    SP ret;
    __asm__ __volatile__ (
        "syscall"
        : "=a" (ret)
        : "a" (num), "D" (arg1), "S" (arg2)
        : "cc", "memory",
          "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
    );
    return ret;
}

// Issue a system call that takes three arguments.
//   num  : system call number, passed in rax
//   arg1 : first argument, passed in rdi
//   arg2 : second argument, passed in rsi
//   arg3 : third argument, passed in rdx
// Returns the raw value left in rax by the kernel.
// Uses __asm__/__volatile__ rather than the plain `asm` keyword so the code
// also builds in strict ISO modes (-std=c99/-std=c11), where `asm` is not a keyword.
__I__ SP Linux3(UP num, UP arg1, UP arg2, UP arg3)
{
    SP ret;
    __asm__ __volatile__ (
        "syscall"
        : "=a" (ret)
        : "a" (num), "D" (arg1), "S" (arg2), "d" (arg3)
        : "cc", "memory",
          "%rcx", "%r8", "%r9", "%r10", "%r11"
    );
    return ret;
}

// Issue a system call that takes four arguments.
//   num  : system call number, passed in rax
//   arg1 : first argument, passed in rdi
//   arg2 : second argument, passed in rsi
//   arg3 : third argument, passed in rdx
//   arg4 : fourth argument, passed in r10
// Returns the raw value left in rax by the kernel.
// There is no single-letter constraint for r10, so the argument is pinned to
// that register with an explicit register variable and passed as "r".
// Uses __asm__/__volatile__ rather than the plain `asm` keyword so the code
// also builds in strict ISO modes (-std=c99/-std=c11), where `asm` is not a keyword.
__I__ SP Linux4(UP num, UP arg1, UP arg2, UP arg3, UP arg4)
{
    SP ret;
    register UP larg4 __asm__("r10") = arg4;
    __asm__ __volatile__ (
        "syscall"
        : "=a" (ret)
        : "a" (num), "D" (arg1), "S" (arg2), "d" (arg3), "r" (larg4)
        : "cc", "memory",
          "%rcx", "%r8", "%r9", "%r11"
    );
    return ret;
}

// Issue a system call that takes five arguments.
//   num  : system call number, passed in rax
//   arg1 : first argument, passed in rdi
//   arg2 : second argument, passed in rsi
//   arg3 : third argument, passed in rdx
//   arg4 : fourth argument, passed in r10
//   arg5 : fifth argument, passed in r8
// Returns the raw value left in rax by the kernel.
// r10 and r8 have no single-letter constraints, so those arguments are pinned
// with explicit register variables and passed as "r".
// Uses __asm__/__volatile__ rather than the plain `asm` keyword so the code
// also builds in strict ISO modes (-std=c99/-std=c11), where `asm` is not a keyword.
__I__ SP Linux5(UP num, UP arg1, UP arg2, UP arg3, UP arg4, UP arg5)
{
    SP ret;
    register UP larg4 __asm__("r10") = arg4;
    register UP larg5 __asm__("r8") = arg5;
    __asm__ __volatile__ (
        "syscall"
        : "=a" (ret)
        : "a" (num), "D" (arg1), "S" (arg2), "d" (arg3),
          "r" (larg4), "r" (larg5)
        : "cc", "memory",
          "%rcx", "%r9", "%r11"
    );
    return ret;
}

// Issue a system call that takes six arguments.
//   num  : system call number, passed in rax
//   arg1 : first argument, passed in rdi
//   arg2 : second argument, passed in rsi
//   arg3 : third argument, passed in rdx
//   arg4 : fourth argument, passed in r10
//   arg5 : fifth argument, passed in r8
//   arg6 : sixth argument, passed in r9
// Returns the raw value left in rax by the kernel.
// r10, r8 and r9 have no single-letter constraints, so those arguments are
// pinned with explicit register variables and passed as "r".
// Uses __asm__/__volatile__ rather than the plain `asm` keyword so the code
// also builds in strict ISO modes (-std=c99/-std=c11), where `asm` is not a keyword.
__I__ SP Linux6(UP num, UP arg1, UP arg2, UP arg3, UP arg4, UP arg5, UP arg6)
{
    SP ret;
    register UP larg4 __asm__("r10") = arg4;
    register UP larg5 __asm__("r8") = arg5;
    register UP larg6 __asm__("r9") = arg6;
    __asm__ __volatile__ (
        "syscall"
        : "=a" (ret)
        : "a" (num), "D" (arg1), "S" (arg2), "d" (arg3),
          "r" (larg4), "r" (larg5), "r" (larg6)
        : "cc", "memory",
          "%rcx", "%r11"
    );
    return ret;
}



Skipping Linux Headers
If you roll your own headers instead of including the kitchen sink, these are the only constants needed:

// Constants copied by value so that no Linux header has to be included.
//   Clone*              = CLONE_* values in "linux/sched.h"
//   CloneWaitErrorRetry = -EINTR
//   CloneWaitWClone     = __WCLONE in "linux/wait.h"
enum {
    // Flags OR-ed together for the clone syscall.
    CloneVM             = 0x0100,
    CloneFS             = 0x0200,
    CloneFiles          = 0x0400,
    CloneSigHand        = 0x0800,
    CloneSysVMem        = 0x040000,   // NOTE(review): 0x40000 looks like CLONE_SYSVSEM in the kernel header; confirm
    CloneIO             = 0x80000000,

    // Values used when waiting on the child.
    CloneWaitErrorRetry = -4,
    CloneWaitWClone     = 0x80000000
};



Starting, Ending, and Waiting on Thread
Note this should just be three lines (but the browser wraps the define). Note the system call parameters are different from the glibc parameters, and the system call works like fork: it returns at the same place in the code, returning zero to the new thread and, to the parent, either the pid or a negative error code (the raw syscall returns -errno rather than -1). Waiting for a thread created with clone to exit is not well documented. Call the wait4 syscall using the __WCLONE option.

// Terminate the calling thread by issuing syscall number 60 with an
// exit status of 0. The syscall's return value is ignored.
__I__ void CloneExit(void)
{
    enum { SyscallExit = 60, ExitStatus = 0 };
    (void)Linux1(SyscallExit, ExitStatus);
}

// Start a thread with the raw clone syscall (number 56).
//   id    : array (or pointer) of SP; id[0] receives the syscall's return value
//   Entry : function called in the new thread as Entry(param)
//   stack : highest address of the new thread's stack
//   param : argument handed to Entry
// The syscall returns 0 in the new thread and a non-zero value in the parent.
// Macro hygiene: `id` is parenthesized so expression arguments index correctly,
// and the local has a prefixed name so it cannot capture a caller variable
// called `pid` that appears in `stack` or `param`.
// Safety net: if Entry ever returns, the new thread exits (syscall 60) instead
// of running on into the parent's code and overwriting the shared id[0].
#define Clone(id, Entry, stack, param) do { \
    SP clonePid_=Linux2(56, CloneFiles|CloneFS|CloneIO|CloneSigHand|CloneSysVMem|CloneVM, (stack)); \
    if(clonePid_==0) { (Entry)((param)); Linux1(60, 0); } \
    (id)[0]=clonePid_; \
} while(0)

// Wait for the thread identified by `id` using syscall number 61 with the
// CloneWaitWClone option. The call is repeated for as long as it returns
// CloneWaitErrorRetry; any other return value ends the wait.
__I__ void CloneWait(UP id)
{
    SP rc;
    do {
        rc = Linux4(61, id, 0, CloneWaitWClone, 0);
    } while (rc == CloneWaitErrorRetry);
}


Example usage below. Make sure to use noinline on the thread entry function so that the compiler doesn't attempt to optimize and use data off the parent's stack. Also create page-aligned stacks, and pass the highest address of the stack into the clone system call, as stacks grow toward lower addresses. This example uses only four pages of stack space taken from the BSS segment. Raw threads created this way need to exit by calling the exit syscall.

// Shorthand used by the example:
//   __S__    : file-local storage
//   __A__(b) : align the declared object to b bytes
//   __N__    : file-local function that is never inlined
#define __S__    static
#define __A__(b) __attribute__((__aligned__(b)))
#define __N__    static __attribute__((__noinline__))
// Stack for the example thread: 4096*4 bytes, aligned to 4096 bytes.
__S__ __A__(4096) UP myStack[(4 * 4096) / sizeof(UP)];

// One-element array that Clone() fills with the clone syscall's return value.
__S__ SP myId[1];

// Thread entry point: does no work and exits immediately via CloneExit().
__N__ void MyFunction(UP param)
{
    (void)param;   // unused in this example
    CloneExit();
}
...
// Start the thread. The stack argument is the highest address of myStack
// (base address plus size). Plain (UP) casts are used, as in the CloneWait
// call, because no UP_() helper is defined in this listing.
Clone(myId, MyFunction, (UP)(myStack)+(UP)(sizeof(myStack)), 0);
...
// Wait for the thread started above.
CloneWait((UP)(myId[0]));