; Listing5-10.asm
;
; A program that demonstrates running a main program
; scheduling several worker threads.
;
; Syntax: MASM (64-bit).  ABI: Microsoft x64 (args in RCX, RDX, R8, R9;
; 32 bytes of shadow space; RSP 16-byte aligned at every call).

            option  casemap:none

            include aoalib.inc          ;AoA library + constants
            includelib aoalib.lib       ;Link in aoalib library

            include c:\masm32\include64\win64.inc
            include c:\masm32\include64\kernel32.inc
            includelib c:\masm32\lib64\kernel32.lib

maxCPUs     =       32                  ;Must be <= 32.
numThreads  =       8

            .const

; Program title:

            align   word
ttlStr      byte    "Listing 5-10", 0

; The following structure gets passed to
; each thread when it is created:

parm_t      struct
affinity    dword   ?                   ;Single-bit CPU mask for the thread
threadNum   dword   ?                   ;Sequential worker number
parm_t      ends

            .data
hProcess    qword   ?
PrcsAffMsk  qword   ?
SysAffMsk   qword   ?
CoresToUse  qword   ?                   ;Bit set = core currently free
Cores       qword   ?                   ;Initial value of CoresToUse

; Array to hold parameters passed to threads.
; Elements are indexed by CPU core, not threadID!

parmArray   parm_t  maxCPUs dup ({})

; The following gets triggered when a thread
; is done using a particular CPU.

doneEvent   qword   ?

; printCS protects calls to print.
; accessCores protects writes to CoresToUse.
; Lock order used by asmMain: accessCores first, then printCS.

printCS     CRITICAL_SECTION {}
accessCores CRITICAL_SECTION {}

; Macros for EnterCriticalSection and LeaveCriticalSection
; to shorten the code a bit. Both expand to a call, so all
; volatile registers must be considered destroyed afterwards.
; (EnterCriticalSection takes a single argument, so only RCX
; is loaded.)

enterCrit   macro   critsec
            lea     rcx, critsec
            call    EnterCriticalSection
            endm

leaveCrit   macro   critsec
            lea     rcx, critsec
            call    LeaveCriticalSection
            endm

            .code

;-----------------------------------------------------------------------
; worker - thread start routine passed to CreateThread by asmMain.
;
; In:    RCX = pointer to this thread's parm_t entry in parmArray.
; Out:   EAX = 0 (thread exit code).
; Saves: RBX (non-volatile) in the local frame.
;
; Frame layout:
;   [rbp+16]  home slot for RCX (caller-provided shadow space)
;   [rbp-8]   saved RBX
;   80 bytes are reserved below RBP, which keeps RSP 16-byte
;   aligned and leaves 32 bytes of shadow space for callees.
;
; The routine pins itself to the core named by parm_t.affinity,
; burns CPU time, returns its core to CoresToUse and signals
; doneEvent.
;-----------------------------------------------------------------------

w_sRCXOfs   =       16
w_sRBXOfs   =       -8
w_saveRCX   textequ <[rbp+w_sRCXOfs]>
w_saveEBX   textequ <[rbp+w_sRBXOfs]>   ;Holds the full 64-bit RBX

worker      proc
            push    rbp
            mov     rbp, rsp
            sub     rsp, 80

; RCX contains the parameter pointer. It
; points to an entry in the parmArray array.

            mov     w_saveEBX, rbx
            mov     w_saveRCX, rcx

; Print a message announcing the actual start
; of this thread:

            enterCrit printCS
            mov     rcx, w_saveRCX
            mov     edx, (parm_t ptr [rcx]).threadNum
            mov     r8d, (parm_t ptr [rcx]).affinity
            call    print
            byte    "Starting thread %d, affinity: %x", nl, 0
            leaveCrit printCS

; Actually set the thread affinity for this thread:

            call    GetCurrentThread
            mov     rcx, w_saveRCX
            mov     edx, (parm_t ptr [rcx]).affinity
            mov     rcx, rax
            call    SetThreadAffinityMask

; SetThreadAffinityMask returns the previous mask, or zero on
; failure. The flags are not meaningful after a call, so test
; the return value explicitly.

            test    rax, rax
            jnz     doWork

            call    GetLastError
            mov     rbx, rax            ;Keep error code across enterCrit
            enterCrit printCS
            mov     edx, ebx
            call    print
            byte    "Error in call to "
            byte    "SetThreadAffinityMask: %d", nl, 0
            leaveCrit printCS
            jmp     workDone

; Simulate doing a lot of work by a
; compute-bound process:

doWork:     mov     rbx, 100000000h
rptLoop:    dec     rbx
            jnz     rptLoop

; We're done with our core, free it for use by
; another thread. Note: must protect Read-modify-write
; access to the global CoresToUse variable.

workDone:   enterCrit accessCores
            mov     rcx, w_saveRCX
            mov     eax, (parm_t ptr [rcx]).affinity    ;Zero-extends into RAX
            or      CoresToUse, rax
            leaveCrit accessCores

; Terminate the thread:
;
; NOTE(review): the core was released above, so asmMain may
; already be refilling this parmArray entry for a new thread
; while it is read below - confirm whether the message can
; show the next thread's number.

            enterCrit printCS
            mov     rcx, w_saveRCX
            mov     edx, (parm_t ptr [rcx]).threadNum
            mov     r8d, (parm_t ptr [rcx]).affinity
            call    print
            byte    "Terminating thread %d, affinity:%x", nl, 0

; Signal the main thread that we've finished. Note: do this
; inside the printCS critical section so that other threads
; don't print data between leaving this critical section and
; signalling the event.

            mov     rcx, doneEvent
            call    SetEvent
            leaveCrit printCS

            xor     eax, eax            ;Thread exit code = 0
            mov     rbx, w_saveEBX      ;Restore from worker's own slot
            leave
            ret
worker      endp

;-----------------------------------------------------------------------
; asmMain - the main assembly language function.
;
; Builds the set of usable cores from the process affinity mask,
; starts numThreads workers (one per free core, waiting on
; doneEvent when no core is free), then waits until every core
; has been handed back before releasing resources.
;
; Saves: RBX and RSI (non-volatile) in the local frame.
; Frame: 128 bytes reserved (locals plus shadow/stack-argument
;        space for callees); RSP stays 16-byte aligned.
; Register roles in the scheduling loop:
;   EBX = number of the next worker thread
;   RSI = bit index of the core chosen for that thread
;-----------------------------------------------------------------------

rbxOfs      =       -8
rsiOfs      =       rbxOfs-8
raxOfs      =       rsiOfs-8
saveRBX     textequ <[rbp+rbxOfs]>
saveRSI     textequ <[rbp+rsiOfs]>
saveRAX     textequ <[rbp+raxOfs]>

            public  asmMain
asmMain     proc
            push    rbp
            mov     rbp, rsp
            sub     rsp, 128            ;Locals and shadow storage

            mov     saveRBX, rbx        ;Preserve non-volatile RBX
            mov     saveRSI, rsi        ; and RSI registers.

; Initialize printCS and accessCores critical sections:

            lea     rcx, printCS
            call    InitializeCriticalSection
            lea     rcx, accessCores
            call    InitializeCriticalSection

; Set up an event to handle thread completion:

            xor     rcx, rcx            ;Default security
            xor     rdx, rdx            ;Auto reset
            xor     r8, r8              ;Not signalled
            xor     r9, r9              ;No name
            call    CreateEvent
            mov     doneEvent, rax

; Determine how many CPUs are available:

            call    GetCurrentProcess
            mov     hProcess, rax

            mov     rcx, rax
            lea     rdx, PrcsAffMsk
            lea     r8, SysAffMsk
            call    GetProcessAffinityMask

; We're not going to use CPUs 0 & 1, reserve them for system
; use and the main program. This code also assumes that there
; are 32 or fewer cores in the system (so we can use 32-bit
; arithmetic)-- adjust if you need to use more than 32 cores.
;
; The following also masks out the logical/virtual cores
; (the odd bits, the ones that use hyperthreading).
; Note: 055555554h = 0101 0101 0101 ... 0101 0100

            mov     rax, PrcsAffMsk
            and     rax, 055555554h
            mov     CoresToUse, rax
            mov     Cores, rax

; If we had SSE4 guaranteed, we could use popcnt
; instruction here. Only executes once, so it's not worth
; testing to see if popcnt is available.
; Each pass shifts the low bit of EAX into the carry flag and
; adds it to the running count in ECX.

            xor     ecx, ecx
doPopCnt:   shr     eax, 1
            adc     ecx, 0
            test    eax, eax
            jnz     doPopCnt

            mov     edx, ecx            ;Note: no threads yet, don't
            mov     r8, CoresToUse      ; have to protect this
            call    print
            byte    "CPUs available: %x, affinity:%x", nl, 0

            xor     rbx, rbx            ;Worker thread number.

rptLoop:    enterCrit accessCores
            enterCrit printCS
            mov     rdx, CoresToUse
            call    print
            byte    "Affinity to use:%x", nl, 0

; Find the first available CPU in the affinity list.
; Read CoresToUse from memory rather than trusting RDX (a
; volatile register) to survive the call to print; the
; accessCores critical section is still held, so the value
; cannot change underneath us. BSF sets ZF when its source
; is zero, i.e. when no core is free.

            bsf     rsi, CoresToUse
            jnz     gotCore

            leaveCrit printCS
            leaveCrit accessCores

; No cores available, wait for someone to quit.

            enterCrit printCS
            call    print
            byte    "Waiting for a core", nl, 0
            leaveCrit printCS

            mov     rcx, doneEvent
            mov     edx, INFINITE
            call    WaitForSingleObject
            jmp     rptLoop

; Fill in the parameter information for the new worker
; thread. Note that this code entered the accessCores
; critical section earlier, so the following access is
; protected.

gotCore:    btr     CoresToUse, rsi     ;Clear the bit
            mov     edx, ebx
            xor     r8, r8
            bts     r8, rsi             ;R8 = mask for the chosen core
            call    print
            byte    "Scheduling thread %d, affinity:%x", nl, 0
            leaveCrit printCS
            leaveCrit accessCores

; Okay fill in the parameter data:

            lea     rcx, parmArray
            mov     rax, rsi            ;RSI=index to CPU we're using
            mov     (parm_t ptr [rcx+rax*8]).threadNum, ebx
            xor     edx, edx
            bts     edx, esi            ;Generate this core's affinity
            mov     (parm_t ptr [rcx+rax*8]).affinity, edx

; Start the thread:

            lea     r9, [rcx+rax*8]     ;Parameter data for thread
            xor     rcx, rcx            ;No security attributes
            xor     rdx, rdx            ;Default stack size
            lea     r8, worker          ;Address of thread code
            xor     rax, rax            ;Default thread flags
            mov     [rsp+32], rax       ;Must pass on stack.
            mov     [rsp+40], rax       ;Don't save threadID.
            call    CreateThread
            test    rax, rax            ;Check for failure
            jz      disaster

; The handle is never used again (completion is tracked through
; CoresToUse and doneEvent), so close it now instead of leaking
; it. Closing the handle does not stop the thread.

            mov     rcx, rax
            call    CloseHandle

            inc     ebx
            cmp     ebx, numThreads
            jb      rptLoop

; Finished scheduling all the threads

            enterCrit printCS
            call    print
            byte    "Done scheduling threads", nl, 0
            leaveCrit printCS

; Once we've scheduled all the threads, wait for all
; the workers to complete before we quit. This code
; accomplishes that by waiting until CoresToUse
; returns to its original (pre-thread-scheduling)
; value.

whlBusy:    mov     rax, CoresToUse     ;Atomic read
            cmp     rax, Cores
            je      allDone

; Wait for some working thread to finish:

            mov     rcx, doneEvent
            mov     edx, INFINITE
            call    WaitForSingleObject
            jmp     whlBusy

; CreateThread failed. Report it and fall through to cleanup.
; NOTE(review): workers started earlier may still be running
; when the critical sections are deleted below - confirm this
; is acceptable because the program is about to exit.

disaster:   enterCrit printCS
            call    print
            byte    "Bad CreateThread call, quitting", nl, 0
            leaveCrit printCS

; Terminate the program.
; Free up all the resources we used:

allDone:    lea     rcx, printCS
            call    DeleteCriticalSection
            lea     rcx, accessCores
            call    DeleteCriticalSection
            mov     rcx, doneEvent      ;CloseHandle wants the handle value,
            call    CloseHandle         ; not the address of the variable

            mov     rbx, saveRBX
            mov     rsi, saveRSI
            leave
            ret                         ;Returns to caller

asmMain     endp
            end