; aoaulib.inc
;
;   Header file containing external function
; definitions, constants, and other items used
; by code in "The Art of 64-bit Assembly Language
; Volume 2"

            option          casemap:none

            ifndef      aoaulib_inc
aoaulib_inc equ         0


; Constant definitions:
;
; ASCII control codes:
;
;   bs  - backspace
;   tab - horizontal tab
;   nl  - newline
;   lf  - line feed (same code as nl)
;   cr  - carriage return

bs          =       8
tab         =       9
nl          =       10
lf          =       nl
cr          =       13

; CPU feature-flag masks tested in ECX:

SSE42       =       00080000h or 00100000h  ;Bits 19 & 20
AVXSupport  =       10000000h               ;Bit 28

; CPUID bits (EAX=7, EBX register)

AVX2Support =       20h                     ;Bit 5 = AVX2


;**********************************************************
;
; external data declarations:
    
            externdef   ttlStr:word         ;Word-sized data item



;**********************************************************
;
; external function declarations:
;
; NOTE(review): wprint is declared with type qword while
; the other routines are declared as proc -- confirm that
; this is intentional.

            externdef   wprint:qword
            externdef   getwTitle:proc, readLine:proc

; The C/C++ wprintf function. The wprint function
; calls it, and some AoA sample programs also call
; it directly.

            externdef   wprintf:proc
                            
                            
;**********************************************************
;
; utf16 macro to convert UTF-8 strings to
; wide strings (UTF-16). 
; 
; Inspired by the L.inc macro system 
; by Ernest Murphy
;
; Note: utfInRange is a helper macro to see if
; the 2nd, 3rd, and 4th bytes of a UTF-8
; multi-byte sequence are in the range 80h..0bfh. 

            
utfInRange  macro   value:req,txt1:req,txt2:req

;; Macro function: checks that "value" is a legal UTF-8
;; continuation byte (80h..0bfh).
;;
;;  value - numeric expression to test
;;  txt1  - text naming the byte position (for the message)
;;  txt2  - text naming the sequence kind (for the message)
;;
;; Returns <0> when value is in range. Otherwise it echoes
;; a diagnostic, raises an assembly error, and returns <1>.

            if      (value ge 80h) and (value le 0bfh)
            exitm   <0>
            endif

%           echo    txt1 byte of txt2 UTF-8 sequence 
            echo    must be in range 80h..0bfh.
            .err
            exitm   <1>
            endm
            
            

utf16       macro   utf8Str:req 

;; Macro function that converts a quoted UTF-8 string
;; literal into a comma-separated list of numeric values,
;; one per UTF-16 code unit, suitable as the operand list
;; of a WORD directive.
;;
;;  utf8Str - the string literal, including its opening
;;            and closing delimiter. Both delimiters must
;;            be the same character (" or ') and the whole
;;            argument may be at most 120 characters long.
;;
;; Returns the operand text. Code points above 0ffffh are
;; returned as a surrogate pair (high word, low word).
;; No terminating zero is appended; use "\0" for that.
;;
;; On malformed input the macro echoes a diagnostic and
;; raises an assembly error.

            local   resultStr, eachChar, state
            local   curChar, ndx, slen, chrCode
            local   chASCII, word0, word1, wtxt0
            local   wtxt1, escPosn, firstChar
            local   lastChar, delimChar
             
curChar     textequ <> 
resultStr   textequ <> 
state       textequ <"noPrefix">
slen        sizestr <utf8Str>
word0       =       0
word1       =       0
ndx         =       0
 
            if      slen gt 120
            echo    utf16 string argument is too long.
            echo    Must be 120 characters or less.
            .err
            exitm   <0>
            endif
            
firstChar   substr <utf8Str>,1,1
lastChar    substr <utf8Str>,slen,1
            ifdif   firstChar, lastChar
            echo    String deliminters must match
            .err
            exitm   <0>
            endif

delimChar   instr   1, <"'>, firstChar
            if      delimChar eq 0
            echo    String delimiter character must
            echo    be " or '
            .err
            exitm   <0>
            endif   

;; For each character in the string argument, 
;; do the following:

            forc    eachChar, <&utf8Str>
            

;; Skip the first and last characters of the string
;; (these will be the " or ' delimiter characters).
;; ndx is the zero-based index of the current character.

            if      (ndx ne 0) and (ndx lt (slen-1))


;;-----------------------------------------------------
;;          
;; Handle characters with a "\" prefix character here.
;;
;; The "state" variable is set to "hasPrefix" if the
;; previous character was a "\" character. When that
;; happens, the following code converts the current
;; character as follows:
;;
;;  \ -> \  ("\\" becomes a single "\")
;;  | -> !  ("!" breaks MASM as it is an escape char)
;;  n -> nl (new line character)
;;  r -> CR (carriage return)
;;  t -> tab 
;;  b -> backspace 
;;  [ -> <  ("<" is reserved by MASM in text strings)
;;  ] -> >  (">" is also reserved by MASM)
;;  { -> (  ("(" is reserved by MASM in text strings)
;;  } -> )  (")" is also reserved by MASM)
;;  0 -> zero terminating word
;;  anything else -> gets converted to its ASCII code

            ifidn   state, <"hasPrefix">

escPosn     instr   1,<\|nrtb[]{}0>,<&eachChar>
            if      escPosn ne 0
            
;; If we match one of the ESC characters that legally
;; follow a "\", pick the replacement value out of the
;; table below. Each table entry is exactly three
;; characters long, so entry n (1-based) starts at
;; character position (n-1)*3+1.

;;                   \  ! nl cr tab bs <  >  (  )  0
curChar     substr <5ch21h0ah0dh09h08h3ch3eh28h29h00h>,(escPosn-1)*3+1,3


            else
            
;; "\" character before an arbitrary character. 
;; Just convert it to its ASCII code:

chrCode     catstr  <">, <&eachChar>, <">
chASCII     =       chrCode
curChar     textequ %chASCII

            endif

;; After processing the prefixed character, return the
;; state back to "noPrefix" mode:

state       textequ <"noPrefix">
 
            else

;;----------------------------------------------------
;;
;; Look for a leading "\" character that begins
;; the prefix sequence. Nothing is emitted for the
;; "\" itself (curChar is left empty).

            ifidn   <&eachChar>, <\> 
state       textequ <"hasPrefix"> 
curChar     textequ <> 
            


;;-----------------------------------------------------
;; 
;; Dealing with non-prefixed characters at this point. 
;; There are a couple of possibilities:
;;
;; 1)  Plain Old ASCII character in the range 0-7fh
;; 2)  UTF-8 2-byte sequence beginning with a byte in 
;;     the range 0c2h-0dfh, followed by a single byte 
;;     in the range 80h-0bfh.
;; 3)  UTF-8 3-byte sequence beginning with a byte 
;;     in the range 0e0h-0efh, followed by two bytes 
;;     in the range 80h-0bfh.
;; 4)  UTF-8 4-byte sequence beginning with a byte 
;;     in the range 0f0h-0f7h, followed by three bytes 
;;     in the range 80h-0bfh.
;;

            else    ;; else must be a UTF-8 char
            
;; First, convert the character to a numeric code so 
;; we can compare its code numerically.

chrCode     catstr  <">, <&eachChar>, <">
chASCII     =       chrCode


;;-----------------------------------------------
;;
;; The following IFxxx statements check to see if
;; this macro is in the process of handling a multi-
;; byte UTF-8 sequence. UTF-8 multi-byte states
;; are set as follows:
;;
;; Char code in range       Bytes in sequence
;;    0c2h..0dfh            2 (state:"2-byte")
;;    0e0h..0efh            3 (states:"3-byte0" and
;;                                       "3-byte1")
;;    0f0h..0f7h            4 (states:"4-byte0",
;;                              "4-byte1", 
;;                               and "4-byte2")
;;
;; Bytes in the range 80h..0bfh are reserved for the
;; 1, 2, or 3 bytes that follow the first bytes of
;; a 2-, 3-, or 4-byte UTF-8 prefix value.
;;
;;
;;          
;; Check for 2nd byte of a 2-byte UTF-8 sequence.
;; If we're in "2-byte" state, we've already seen
;; the byte with the 0c2h..0dfh value and we're
;; processing the second byte of the sequence here:

            ifidn   state, <"2-byte">

            if      utfInRange(chASCII,2nd,2-byte)
            exitm   <0>     ;; Stop macro if error
            endif
            
;; Use LO six bits of this byte as the LO six bits 
;; of UTF-16 value. Note that the HO five bits have 
;; already been stored into bits 6..10 in word0.

chASCII     =       word0 or (chASCII and 3fh)
state       textequ <"noPrefix">
curChar     textequ %chASCII        
            
            else
            
            
;;-------------------------------------------------
;;          
;; Check for 2nd byte of a 3-byte UTF-8 sequence.
;; If we're in "3-byte0" state, we've already seen
;; the byte with the 0e0h..0efh value and we're
;; processing the second byte of the sequence here:

            ifidn   state, <"3-byte0">
            
            if      utfInRange(chASCII,2nd,3-byte)
            exitm   <0>     ;; Stop macro if error
            endif
            
;; Use LO six bits of this byte as bits 6..11 of 
;; UTF-16 value. Note that the HO four bits have 
;; already been stored into bits 12..15 in word0.

word0       =       word0 or ((chASCII and 3fh) shl 6)
state       textequ <"3-byte1">
curChar     textequ <>      

            
            
            else
            
            
;;-------------------------------------------------
;;          
;; Check for 3rd byte of a 3-byte UTF-8 sequence.
;; If we're in "3-byte1" state, we've already seen
;; the first two bytes and we're processing the 
;; third byte of the sequence here:

            ifidn   state, <"3-byte1">
            
            if      utfInRange(chASCII,3rd,3-byte)
            exitm   <0>     ;; Stop macro if error
            endif
            
;; Use LO six bits of this byte as bits 0..5 of 
;; UTF-16 value. Note that the HO 10 bits have 
;; already been stored into bits 6..15 in word0.

chASCII     =       word0 or (chASCII and 3fh)
state       textequ <"noPrefix">
curChar     textequ %chASCII

            else
            
            
            
;;-------------------------------------------------
;;          
;; Check for 2nd byte of a 4-byte UTF-8 sequence.
;; If we're in "4-byte0" state, we've already seen
;; the byte with the 0f0h..0f7h value and we're
;; processing the second byte of the sequence here:

            ifidn   state, <"4-byte0">
            
            if      utfInRange(chASCII,2nd,4-byte)
            exitm   <0>     ;; Stop macro if error
            endif
            
;; Use LO six bits of this byte as bits 12-17 of 
;; UTF-32 value. Note that the HO three bits have 
;; already been stored into bits 2..4 in word1.
;; word1 holds code point bits 16..20 and word0
;; holds code point bits 0..15, so the six bits are
;; split: two go into word1, four into word0.

word1       =       word1 or ((chASCII shr 4) and 3)
word0       =       (chASCII and 0fh) shl 12
state       textequ <"4-byte1">
curChar     textequ <>      
            
            else
            
            
            
;;-------------------------------------------------
;;          
;; Check for 3rd byte of a 4-byte UTF-8 sequence.
;; If we're in "4-byte1" state, we've already seen
;; the first two bytes and we're processing the 
;; third byte of the sequence here:

            ifidn   state, <"4-byte1">
            
            if      utfInRange(chASCII,3rd,4-byte)
            exitm   <0>     ;; Stop macro if error
            endif
            
;; Use LO six bits of this byte as bits 6-11 of 
;; UTF-32 value.

word0       =       word0 or ((chASCII and 03fh) shl 6)
state       textequ <"4-byte2">
curChar     textequ <>      


            else
            
            
            
;;-------------------------------------------------
;;          
;; Check for 4th byte of a 4-byte UTF-8 sequence.
;; If we're in "4-byte2" state, we've already seen
;; the first three bytes and we're processing the 
;; fourth byte of the sequence here:

            ifidn   state, <"4-byte2">
            
            if      utfInRange(chASCII,4th,4-byte)
            exitm   <0>     ;; Stop macro if error
            endif
            
;; Use LO six bits of this byte as bits 0-5 of 
;; UTF-32 value.

word0       =       word0 or (chASCII and 03fh)
state       textequ <"noPrefix">

;; word1 now holds bits 16..20 of the code point. Only
;; code points 10000h..10FFFFh can be written as a
;; surrogate pair, so word1 must be in the range 1..10h.
;; A value of 0 is an overlong encoding; a value above
;; 10h is beyond the Unicode range.

            if      (word1 lt 1) or (word1 gt 10h)
            echo    4-byte UTF-8 sequence encodes a code
            echo    point outside 10000h..10FFFFh.
            .err
            exitm   <0>     ;; Stop macro if error
            endif

;; Now must convert the 21-bit value in word0 and
;; word1 into a pair of UTF-16 values. UTF-16 first
;; subtracts 10000h from the code point, leaving a
;; 20-bit value; as word1 holds bits 16 and up, that
;; is the same as subtracting one from word1. The
;; first word to output contains 110110xxxxxxxxxx
;; (where xxxxxxxxxx represents the HO ten bits of
;; the 20-bit value). The second word to output 
;; contains 110111yyyyyyyyyy (where yyyyyyyyyy 
;; represents the LO ten bits of the value). 

word1       =       (word1 - 1) shl 6
word1       =       word1 or ((word0 and 0fc00h) shr 10)
word1       =       word1 or 0d800h

word0       =       (word0 and 3ffh) or 0dc00h 

;; Convert those two words to integers and emit
;; to the operand stream:

wtxt0       textequ %word0
wtxt1       textequ %word1
curChar     catstr  &wtxt1,<,>,&wtxt0


            else
            
;;----------------------------------------------------
;;          
;; If we don't have a UTF-8 prefix, handle the 
;; character code down here:

            if      chASCII lt 080h

;; Standard ASCII character. Emit its ASCII code as 
;; the WORD value:

curChar     textequ %chASCII


;; As we're not in a UTF-8 prefix mode at this point,
;; character codes in the range 80h..0c1h and 0f8h..0ffh
;; are illegal.

            elseif  (chASCII le 0c1h) or (chASCII ge 0f8h)
            
;; Encountered a byte with a code in the range 80h..0c1h or
;; 0f8h..0ffh.
;; This is unexpected at this point:

curChar     textequ %chASCII

            echo    Unexpected 80h..0c1h (128..193) or 
            echo    0f8h..0ffh (248..255) byte in string
            echo    (not a legal UTF-8 prefix).
%           echo    Unexpected code is curChar
            .err
            exitm   <0>     ;; Stop macro if error


;; Handle UTF-8 prefixes in the range 0c2h..0dfh here:

            elseif  chASCII lt 0e0h
                                                            
;; We've got a two-byte UTF-8 sequence. Save the 
;; LO 5 bits of the prefix byte in bits 6..10 of word0
;; and set the state accordingly:

word0       =       (chASCII and 1fh) shl 6
state       textequ <"2-byte">
curChar     textequ  <>



;; Handle UTF-8 prefixes in the range 0e0h..0efh here:

            elseif  chASCII lt 0f0h
            
;; We've got a three-byte UTF-8 sequence. Save the  
;; LO 4 bits of the prefix byte in bits 12..15 of word0
;; and set the state accordingly:
 
word0       =       (chASCII and 0fh) shl 12
state       textequ <"3-byte0">
curChar     textequ  <>

;; Handle UTF-8 prefixes in the range 0f0h..0f7h here.
;; Codes in the range 0f8h..0ffh were rejected above.
;; Prefixes 0f5h..0f7h lead to code points beyond
;; 10FFFFh; those are rejected once the 4th byte of
;; the sequence has been processed.

            else    ;; Must be in range 0f0h..0f7h
            
;; We've got a four-byte UTF-8 sequence. Save the  
;; LO 3 bits of the prefix byte in bits 2..4 of word1
;; and set the state accordingly:

word1       =       (chASCII and 07h) shl 2
word0       =       0
state       textequ <"4-byte0">
curChar     textequ  <>

            endif   ;; chASCII lt 080h
            endif   ;; state, <"4-byte2">
            endif   ;; state, <"4-byte1">
            endif   ;; state, <"4-byte0">
            endif   ;; state, <"3-byte1">
            endif   ;; state, <"3-byte0">
            endif   ;; state, <"2-byte"> 
            endif   ;; <&eachChar>, <\>
            endif   ;; state, <"hasPrefix">
             
             
;; Here's some clean up to produce syntactically
;; correct operands for the MASM WORD directive.
;;
;; resultStr is the current set of operand values
;; being constructed by this macro for the whole
;; UTF16 string. If it is the empty string, then
;; we are about to insert the very first operand
;; value into the string; If it is not the empty
;; string, then we're appending new values to
;; the end of existing values. In that case, we
;; need to emit a comma to separate the values.
;; The comma is only emitted in the "noPrefix"
;; state, that is, when a complete character has
;; just been converted.
             
            ifdif   resultStr, <>
             
            ifidn   state, <"noPrefix"> 
resultStr   catstr  resultStr, <,> 
            endif   ;; state, <"noPrefix">
             
            endif   ;; resultStr, <> 
            
;; Whenever this macro processes some sort of
;; prefix value (either a "\" character or
;; a UTF-8 multi-byte prefix value), it leaves
;; the "curChar" variable empty. This is an
;; indication that this macro should not
;; yet append anything to the resultString.
 
            ifdif   curChar, <> ;; Ignore if blank 
resultStr   catstr  resultStr, &curChar 
            endif   ;;curChar, <>
            
            
            endif   ;;(ndx ne 0) and (ndx lt (slen-1))

;; End of the forc loop (that processes each
;; character fed to this macro):
                    
ndx         =       ndx + 1
            endm    ;;forc

;; Return the result back to whoever
;; invoked this macro:
            
            exitm   resultStr 
            endm    ;;utf16

;----------------------------------------------------
            
            
            endif       ;aoaulib_inc