; aoaulib.inc
;
; Header file containing external function
; definitions, constants, and other items used
; by code in "The Art of 64-bit Assembly Language
; Volume 2"
;
; Syntax: Microsoft MASM (ml64). Everything in this
; file is assembly-time only (equates, externdefs,
; and macro functions); it emits no code or data by
; itself.

            option  casemap:none

            ifndef  aoaulib_inc
aoaulib_inc equ     0

; Constant definitions:
;
; bs:   backspace constant
; tab:  tab constant
; nl:   newline constant
; lf:   line feed constant (same value as nl)
; cr:   carriage return constant

bs          =       08
tab         =       09
nl          =       10
lf          =       10
cr          =       13

; SSE4.2 feature flags (in ECX):

SSE42       =       00180000h       ;Bits 19 & 20
AVXSupport  =       10000000h       ;Bit 28

; CPUID bits (EAX=7, EBX register)

AVX2Support =       20h             ;Bit 5 = AVX2

;**********************************************************
;
; external data declarations:

            externdef ttlStr:word

;**********************************************************
;
; external function declarations:

            externdef wprint:qword
            externdef getwTitle:proc
            externdef readLine:proc

; Definition of C/C++ wprintf function that
; the wprint function will call (and some
; AoA sample programs call this directly,
; as well).

            externdef wprintf:proc

;**********************************************************
;
; utf16 macro to convert UTF-8 strings to
; wide strings (UTF-16).
;
; Inspired by the L.inc macro system
; by Ernest Murphy
;
; Note: utfInRange is a helper macro to see if
; the 2nd, 3rd, and 4th bytes of a UTF-8
; multi-byte sequence are in the range 80h..0bfh.
;
; utfInRange( value, txt1, txt2 )
;
;   value:  numeric byte value to test.
;   txt1:   ordinal text ("2nd", "3rd", "4th") used
;           only in the diagnostic message.
;   txt2:   sequence-length text ("2-byte", ...) used
;           only in the diagnostic message.
;
;   Returns <0> if value is a legal continuation byte.
;   Otherwise echoes a diagnostic, raises .err, and
;   returns <1>.

utfInRange  macro   value:req,txt1:req,txt2:req
            if      (value lt 80h) or (value ge 0c0h)
%           echo    txt1 byte of txt2 UTF-8 sequence
            echo    must be in range 80h..0bfh.
            .err
            exitm   <1>
            endif
            exitm   <0>
            endm

;----------------------------------------------------------
;
; utf16( "string" )
;
;   Macro function that expands to a comma-separated
;   list of numeric operands suitable for a WORD
;   directive, e.g.:
;
;       myStr   word    utf16("Hello\n\0")
;
;   utf8Str: a string literal delimited by matching
;            " or ' characters, at most 120 characters
;            long (delimiters included). The text may
;            contain "\" escapes (see the table in the
;            macro body) and UTF-8 multi-byte sequences.
;
;   The macro does not append a zero terminator; use
;   "\0" in the string if one is needed.
;
;   On any error the macro echoes a diagnostic and
;   raises .err so the assembly fails.

utf16       macro   utf8Str:req
            local   resultStr, eachChar, state
            local   curChar, ndx, slen, chrCode
            local   chASCII, word0, word1, wtxt0
            local   wtxt1, escPosn, firstChar
            local   lastChar, delimChar

curChar     textequ <>
resultStr   textequ <>
state       textequ <"noPrefix">
slen        sizestr <&utf8Str>
word0       =       0
word1       =       0
ndx         =       0

            if      slen gt 120
            echo    utf16 string argument is too long.
            echo    Must be 120 characters or less.
            .err
            exitm   <0>
            endif

firstChar   substr  <&utf8Str>,1,1
lastChar    substr  <&utf8Str>,slen,1
            ifdif   firstChar, lastChar
            echo    String deliminters must match
            .err
            exitm   <0>
            endif

delimChar   instr   1, <"'>, firstChar
            if      delimChar eq 0
            echo    String delimiter character must
            echo    be " or '
            .err
            exitm   <0>
            endif

;; For each character in the string argument,
;; do the following:

            forc    eachChar, <&utf8Str>

;; Skip the first and last characters of the string
;; (these will be the " or ' delimiter characters)

            if      (ndx ne 0) and (ndx lt (slen-1))

;;-----------------------------------------------------
;;
;; Handle characters with a "\" prefix character here.
;;
;; The "state" variable is set to "hasPrefix" if the
;; previous character was a "\" character. When that
;; happens, the following code converts the current
;; character as follows:
;;
;;  \ -> \   ("\\" becomes a single "\")
;;  | -> !   ("!" breaks MASM as it is an escape char)
;;  n -> nl  (new line character)
;;  r -> CR  (carriage return)
;;  t -> tab
;;  b -> backspace
;;  [ -> <   ("<" is reserved by MASM in text strings)
;;  ] -> >   (">" is also reserved by MASM)
;;  { -> (   ("(" is reserved by MASM in text strings)
;;  } -> )   (")" is also reserved by MASM)
;;  0 -> zero terminating word
;;  anything else -> gets converted to its ASCII code

            ifidn   state, <"hasPrefix">

escPosn     instr   1,<\|nrtb[]{}0>,<&eachChar>
            if      escPosn ne 0

;; If we match one of the ESC characters that legally
;; follow a "\", convert the character to the new
;; hex value. Each table entry is exactly three
;; characters wide, hence the (escPosn-1)*3+1 index.

;;                    \  !  nl cr tab bs <  >  (  )  0
curChar     substr  <5ch21h0ah0dh09h08h3ch3eh28h29h00h>,(escPosn-1)*3+1,3

            else

;; "\" character before an arbitrary character.
;; Just convert it to its ASCII code:

chrCode     catstr  <">, <&eachChar>, <">
chASCII     =       chrCode
curChar     textequ %chASCII

            endif

;; After processing the prefixed character, return the
;; state back to "noPrefix" mode:

state       textequ <"noPrefix">

            else

;;----------------------------------------------------
;;
;; Look for a leading "\" character that begins
;; the prefix sequence:

            ifidn   <&eachChar>, <\>

state       textequ <"hasPrefix">
curChar     textequ <>

;;-----------------------------------------------------
;;
;; Dealing with non-prefixed characters at this point.
;; There are a couple of possibilities:
;;
;;  1) Plain Old ASCII character in the range 0-7fh
;;  2) UTF-8 2-byte sequence beginning with a byte in
;;     the range 0c0h-0dfh, followed by a single byte
;;     in the range 80h-0bfh.
;;  3) UTF-8 3-byte sequence beginning with a byte
;;     in the range 0e0h-0efh, followed by two bytes
;;     in the range 80h-0bfh.
;;  4) UTF-8 4-byte sequence beginning with a byte
;;     in the range 0f0h-0f7h, followed by three bytes
;;     in the range 80h-0bfh.
;;

            else    ;; else must be a UTF-8 char

;; First, convert the character to an ASCII code so
;; we can compare its code numerically.

chrCode     catstr  <">, <&eachChar>, <">
chASCII     =       chrCode

;;-----------------------------------------------
;;
;; The following IFxxx statements check to see if
;; this macro is in the process of handling a multi-
;; byte UTF-8 sequence. UTF-8 multi-byte states
;; are set as follows:
;;
;;  Char code in range      Bytes in sequence
;;  0c2h..0dfh              2 (state:"2-byte")
;;  0e0h..0efh              3 (states:"3-byte0" and
;;                                    "3-byte1")
;;  0f0h..0f7h              4 (states:"4-byte0",
;;                                    "4-byte1",
;;                                and "4-byte2")
;;
;; Bytes in the range 80h..0bfh are reserved for the
;; 1, 2, or 3 bytes that follow the first bytes of
;; a 2-, 3-, or 4-byte UTF-8 prefix value.
;;
;;
;;
;; Check for 2nd byte of a 2-byte UTF-8 sequence.
;; If we're in "2-byte" state, we've already seen
;; the byte with the 0c2h..0dfh value and we're
;; processing the second byte of the sequence here:

            ifidn   state, <"2-byte">

            if      utfInRange(chASCII,2nd,2-byte)
            exitm   <0>     ;; Stop macro if error
            endif

;; Use LO six bits of this byte as the LO six bits
;; of UTF-16 value. Note that the HO five bits have
;; already been stored into bits 6..10 in word0.

chASCII     =       word0 or (chASCII and 3fh)
state       textequ <"noPrefix">
curChar     textequ %chASCII

            else

;;-------------------------------------------------
;;
;; Check for 2nd byte of a 3-byte UTF-8 sequence.
;; If we're in "3-byte0" state, we've already seen
;; the byte with the 0e0h..0efh value and we're
;; processing the second byte of the sequence here:

            ifidn   state, <"3-byte0">

            if      utfInRange(chASCII,2nd,3-byte)
            exitm   <0>     ;; Stop macro if error
            endif

;; Use LO six bits of this byte as bits 6..11 of
;; UTF-16 value. Note that the HO four bits have
;; already been stored into bits 12..15 in word0.

word0       =       word0 or ((chASCII and 3fh) shl 6)
state       textequ <"3-byte1">
curChar     textequ <>

            else

;;-------------------------------------------------
;;
;; Check for 3rd byte of a 3-byte UTF-8 sequence.
;; If we're in "3-byte1" state, we've already seen
;; the first two bytes and we're processing the
;; third byte of the sequence here:

            ifidn   state, <"3-byte1">

            if      utfInRange(chASCII,3rd,3-byte)
            exitm   <0>     ;; Stop macro if error
            endif

;; Use LO six bits of this byte as bits 0..5 of
;; UTF-16 value. Note that the HO 10 bits have
;; already been stored into bits 6..15 in word0.

chASCII     =       word0 or (chASCII and 3fh)
state       textequ <"noPrefix">
curChar     textequ %chASCII

            else

;;-------------------------------------------------
;;
;; Check for 2nd byte of a 4-byte UTF-8 sequence.
;; If we're in "4-byte0" state, we've already seen
;; the byte with the 0f0h..0f7h value and we're
;; processing the second byte of the sequence here:

            ifidn   state, <"4-byte0">

            if      utfInRange(chASCII,2nd,4-byte)
            exitm   <0>     ;; Stop macro if error
            endif

;; The LO six bits of this byte are bits 12..17 of
;; the code point. Bits 16..17 go into bits 0..1 of
;; word1 (bits 18..20 are already in bits 2..4 of
;; word1); bits 12..15 go into bits 12..15 of word0.

word1       =       word1 or ((chASCII shr 4) and 3)
word0       =       (chASCII and 0fh) shl 12
state       textequ <"4-byte1">
curChar     textequ <>

            else

;;-------------------------------------------------
;;
;; Check for 3rd byte of a 4-byte UTF-8 sequence.
;; If we're in "4-byte1" state, we've already seen
;; the first two bytes and we're processing the
;; third byte of the sequence here:

            ifidn   state, <"4-byte1">

            if      utfInRange(chASCII,3rd,4-byte)
            exitm   <0>     ;; Stop macro if error
            endif

;; Use LO six bits of this byte as bits 6..11 of
;; the code point.

word0       =       word0 or ((chASCII and 03fh) shl 6)
state       textequ <"4-byte2">
curChar     textequ <>

            else

;;-------------------------------------------------
;;
;; Check for 4th byte of a 4-byte UTF-8 sequence.
;; If we're in "4-byte2" state, we've already seen
;; the first three bytes and we're processing the
;; fourth byte of the sequence here:

            ifidn   state, <"4-byte2">

            if      utfInRange(chASCII,4th,4-byte)
            exitm   <0>     ;; Stop macro if error
            endif

;; Use LO six bits of this byte as bits 0..5 of
;; the code point.

word0       =       word0 or (chASCII and 03fh)
state       textequ <"noPrefix">

;; At this point word1 holds bits 16..20 of the code
;; point and word0 holds bits 0..15. UTF-16 surrogate
;; pairs can only represent 10000h..10ffffh, so word1
;; must be in the range 1..10h. A value of zero is an
;; overlong encoding of a BMP character; a value above
;; 10h is beyond the end of the Unicode code space.

            if      (word1 eq 0) or (word1 gt 10h)
            echo    4-byte UTF-8 sequence must encode a
            echo    code point in range 10000h..10ffffh.
            .err
            exitm   <0>     ;; Stop macro if error
            endif

;; Now must convert the 21-bit value in word0 and
;; word1 into a pair of UTF-16 values. UTF-16 first
;; subtracts 10000h from the code point (that is,
;; one from word1), leaving a 20-bit value. The first
;; word to output contains 110110xxxxxxxxxx (where
;; xxxxxxxxxx represents the HO ten bits of that
;; value). The second word to output contains
;; 110111yyyyyyyyyy (where yyyyyyyyyy represents
;; the LO ten bits of the value).

word1       =       word1 - 1
word1       =       (word1 shl 6) or ((word0 and 0fc00h) shr 10)
word1       =       word1 or 0d800h
word0       =       (word0 and 3ffh) or 0dc00h

;; Convert those two words to integers and emit
;; to the operand stream:

wtxt0       textequ %word0
wtxt1       textequ %word1
curChar     catstr  &wtxt1,<,>,&wtxt0

            else

;;----------------------------------------------------
;;
;; If we don't have a UTF-8 prefix, handle the
;; character code down here:

            if      chASCII lt 080h

;; Standard ASCII character.
;; Emit its ASCII code as the WORD value:

curChar     textequ %chASCII

;; As we're not in a UTF-8 prefix mode at this point,
;; character codes in the range 80h..0c1h and 0f8h..0ffh
;; are illegal.

            elseif  (chASCII le 0c1h) or (chASCII ge 0f8h)

;; Encountered a byte with a code in the range 80h..0c1h or
;; 0f8h..0ffh.
;; This is unexpected at this point:

curChar     textequ %chASCII
            echo    Unexpected 80h..0c1h (128..193) or
            echo    0f8h..0ffh (248..255) byte in string
            echo    (not a legal UTF-8 prefix).
%           echo    Unexpected code is curChar
            .err
            exitm   <0>     ;; Stop macro if error

;; Handle UTF-8 prefixes in the range 0c2h..0dfh here:

            elseif  chASCII lt 0e0h

;; We've got a two-byte UTF-8 sequence. Save the
;; upper 5 bits in word0 and set the state accordingly:

word0       =       (chASCII and 1fh) shl 6
state       textequ <"2-byte">
curChar     textequ <>

;; Handle UTF-8 prefixes in the range 0e0h..0efh here:

            elseif  chASCII lt 0f0h

;; We've got a three-byte UTF-8 sequence. Save the upper
;; 4 bits in word0 and set the state accordingly:

word0       =       (chASCII and 0fh) shl 12
state       textequ <"3-byte0">
curChar     textequ <>

;; Handle UTF-8 prefixes in the range 0f0h..0f7h here.
;; Codes in the range 0f8h..0ffh were already rejected
;; by the ELSEIF above.

            else    ;; Must be in range 0f0h..0f7h

;; We've got a four-byte UTF-8 sequence. Save the upper
;; 3 bits in bits 2..4 of word1 and set the state
;; accordingly:

word1       =       (chASCII and 07h) shl 2
word0       =       0
state       textequ <"4-byte0">
curChar     textequ <>

            endif   ;; chASCII lt 080h
            endif   ;; state, <"4-byte2">
            endif   ;; state, <"4-byte1">
            endif   ;; state, <"4-byte0">
            endif   ;; state, <"3-byte1">
            endif   ;; state, <"3-byte0">
            endif   ;; state, <"2-byte">
            endif   ;; <&eachChar>, <\>
            endif   ;; state, <"hasPrefix">

;; Here's some clean up to produce syntactically
;; correct operands for the MASM WORD directive.
;;
;; resultStr is the current set of operand values
;; being constructed by this macro for the whole
;; UTF16 string. If it is the empty string, then
;; we are about to insert the very first operand
;; value into the string; If it is not the empty
;; string, then we're appending new values to
;; the end of existing values. In that case, we
;; need to emit a comma to separate the values.

            ifdif   resultStr, <>
            ifidn   state, <"noPrefix">
resultStr   catstr  resultStr, <,>
            endif   ;; state, <"noPrefix">
            endif   ;; resultStr, <>

;; Whenever this macro processes some sort of
;; prefix value (either a "\" character or
;; a UTF-8 multi-byte prefix value), it leaves
;; the "curChar" variable empty. This is an
;; indication that this macro should not
;; yet append anything to the resultString.

            ifdif   curChar, <>     ;; Ignore if blank
resultStr   catstr  resultStr, &curChar
            endif   ;;curChar, <>

            endif   ;;(ndx ne 0) and (ndx lt (slen-1))

;; End of the forc loop (that processes each
;; character fed to this macro):

ndx         =       ndx + 1
            endm    ;;forc

;; If the string ended right after a "\" or part way
;; through a multi-byte UTF-8 sequence, the pending
;; character was never emitted. Report that rather
;; than silently dropping it.

            ifdif   state, <"noPrefix">
            echo    utf16 string ends with an incomplete
            echo    escape or multi-byte UTF-8 sequence.
            .err
            exitm   <0>
            endif

;; Return the result back to whoever
;; invoked this macro:

            exitm   resultStr
            endm    ;;utf16

;----------------------------------------------------

            endif   ;aoaulib_inc