implementation module SaplTokenizer

import StdEnv, FastString

is_comment :: Token -> Bool
is_comment (TComment _) = True
is_comment _ = False

matchCharAt c1 s1 p 
	:== s1.[p] == c1

is_stopchar :: !Char -> Bool
is_stopchar char 
	= (char == '=') || (char == ':') || (char == ')') || (char == '(') || 
	  (char == '|') || (char == '{') || (char == '}') || (char == ',') || 
	  (char == '[') || (char == ']') || isSpace char

not_stopchar = not o is_stopchar
is_space c = isSpace c && not_eol c
not_eol c = not (c == '\n')

find_first_string :: !String !Int (Char String Int -> Bool) -> Int
find_first_string line start f
	| start == size line
		= size line
	| f line.[start-1] line start
		= find_first_string line (start + 1) f 
		= start

read_string_lit :: !Char !Int !String -> (!Int, !UString)
read_string_lit qc start str 
		# (nextbase, l) = decode start []
		= (nextbase, reverse l)
where
	decode p cs
		| eof p			 	= (p, cs)	
		| str.[p] == qc 	= (p+1, cs)
		| str.[p] == '\\' 	= let (n,c) = decodeBSChar (p+1) in decode n [c:cs]
							= decode (p+1) [fromChar str.[p]:cs]
	
	decodeBSChar p
		| eof p 			= (p  ,fromChar '\\')
		| str.[p] == '0'	= (p+1,fromInt 0)
		| str.[p] == 'a'	= (p+1,fromInt 7)
		| str.[p] == 'b'	= (p+1,fromChar '\b')
		| str.[p] == 'f'	= (p+1,fromChar '\f')		
		| str.[p] == 'n'	= (p+1,fromChar '\n')
		| str.[p] == 'r'	= (p+1,fromChar '\r')		
		| str.[p] == 't'	= (p+1,fromChar '\t')
		| str.[p] == 'v'	= (p+1,fromChar '\v')
		| str.[p] == '\''	= (p+1,fromChar '\'')
		| str.[p] == '"'	= (p+1,fromChar '"')
		| str.[p] == 'x'	= decodeHex 2 2 0
		| str.[p] == 'u'	= decodeHex 4 4 0					
		| str.[p] == 'U'	= decodeHex 8 8 0
							= (p+1,fromChar str.[p])	// skip the backslash otherwise
	where
		decodeHex :: !Int !Int !Int -> (!Int, !UChar)
		decodeHex len s acc
			| eof (p+len)	= (p+1, fromChar str.[p])
							= decodeHex` len s acc
		
		decodeHex` :: !Int !Int !Int -> (!Int, !UChar)
		decodeHex` len s acc 
			| s == 0			   = (p+len+1, fromInt acc)			
			| isHexDigit str.[p+s] = decodeHex` len (s-1) (acc+decodeTable.[len-s]*dti str.[p+s])
								   = (p+1, fromChar str.[p])
	
	eof p = p >= size str
	
	decodeTable :: {#Int}
	decodeTable = {1,16,256,4096,65536,1048576,16777216,268435456}
	
	dti :: !Char -> Int
	dti c 
		| c >= '0' && c <= '9' = toInt (c - '0')
		| c >= 'a' && c <= 'f' = 10 + toInt (c - 'a')
		| c >= 'A' && c <= 'F' = 10 + toInt (c - 'A')

read_token :: !Int !String -> (!Int, !Int, !Token)
read_token base line
	| start > ((size line)-1) 
		= rnoarg TEndOfLine 0	
	| matchCharAt '\n' line start
		= rnoarg TEndOfLine 1
	// Skip <{ and }> from the identifier. It's to help parsing "strange" function names
	| matchAt "!<{" line start
		# stop = find_first_string line (start+3) (\prev str base = not (matchAt "}>" str base))
		= return (TStrictIdentifier (line % (start + 3, stop - 1)), stop + 2)		
	| matchAt "<{" line start
		# stop = find_first_string line (start+2) (\prev str base = not (matchAt "}>" str base))
		= return (TIdentifier (line % (start + 2, stop - 1)), stop + 2)		
	| matchAt "=:" line start
		= rnoarg TCAFAssignmentOp 2
	| matchAt "::" line start
		= rnoarg TTypeDef 2		
	| matchAt "||" line start // skip comment
		# stop = (skipChars line (start+2) not_eol)
        = return ((TComment (line % (start + 2, stop - 1))), stop) // leave the EOL there. Note to Laszlo: don't replace this line anymore! We need it to properly detect module names! Grrrrr
	| matchCharAt '|' line start
		= rnoarg TVerticalBar 1		
	| matchCharAt '=' line start 
		= rnoarg TAssignmentOp 1
	| matchAt ":==" line start
		= rnoarg TMacroAssignmentOp 3
	| matchAt "->" line start
		= rnoarg TCaseAssignmentOp 2		
	| matchCharAt ',' line start 
		= rnoarg TComma 1
	| matchCharAt ':' line start 
		= rnoarg TColon 1		
	| matchCharAt '(' line start 
		= rnoarg TOpenParenthesis 1
	| matchCharAt ')' line start 
		= rnoarg TCloseParenthesis 1
	| matchCharAt '{' line start 
		= rnoarg TOpenBracket 1
	| matchCharAt '}' line start 
		= rnoarg TCloseBracket 1
	| matchCharAt '[' line start 
		= rnoarg TOpenSquareBracket 1
	| matchCharAt ']' line start 
		= rnoarg TCloseSquareBracket 1
	| matchCharAt '"' line start
		# (nextbase,ustr) = read_string_lit '"' (start+1) line
		= return (TLit (CString ustr), nextbase)
	| matchCharAt '\'' line start 
		# (nextbase,ustr) = read_string_lit '\'' (start+1) line
		= return (TLit (CChar ustr), nextbase)
	| matchCharAt '+' line start 
		= numberToken 1
	| matchCharAt '-' line start 
		= numberToken 1
// This is not a number!
//	| startsWith "." lline
//		= numberToken 0
	| isDigit (line.[start])
		= numberToken 0
	| otherwise
		# stop = skipChars line start not_stopchar
		= case tstr stop of
			"False"  = return (TLit (CBool False), stop)
			"false"  = return (TLit (CBool False), stop)
			"True"   = return (TLit (CBool True), stop)
			"true"   = return (TLit (CBool True), stop)
			"case"   = return (TCaseKeyword, stop)
			"select" = return (TSelectKeyword, stop)
			"update" = return (TUpdateKeyword, stop)
			"let"	 = return (TLetKeyword, stop)
			"in"	 = return (TInKeyword, stop)
			str		 = if (str.[0] == '!') 
							(return (TStrictIdentifier (str % (1, size str)), stop))
							(return (TIdentifier str, stop))
where
	tstr stop = line % (start, stop - 1)
	start = (skipChars line base is_space)
	
	rnoarg t length = (start, start + length, t)	
	return (a, newbase) = (start, newbase, a)
	
	numberToken p1
		# fpart = skipChars line (start+p1) isDigit
		# (dot, stop) = if ((size line) > fpart && line.[fpart] == '.') 
			(True, skipChars line (fpart+1) isDigit) (False, fpart)
		# (exp, stopexp) = readExp stop
		= return (TLit (if (dot || exp <> 0) 
					(CReal (toReal (tstr stop) * (10.0 ^ toReal (exp))))
					(CInt (toInt (tstr stop)))), stopexp)	
	
	readExp start
		| size line < start + 2
		= (0, start)
		| line.[start] <> 'e' && line.[start] <> 'E'
		= (0, start)
		# stop = skipChars line (start + 1 + signskip) isDigit
		= (sign * (toInt (line % (start + 1 + signskip, stop - 1))), stop)
	where
		sign = case line.[start+1] of
				'+' = 1
				'-' = -1
					= 1
		signskip = if (isDigit line.[start+1]) 0 1
	
tokensWithPositions :: !String -> [PosToken]
tokensWithPositions inp = tokens_ 1 1 0 [] 
where 
	finalise _ _ ts=:[PosToken _ _ TEndOfLine:_] = reverse ts
	finalise lp cp ts = reverse [PosToken lp cp TEndOfLine:ts]
	
	tokens_ lp cp base ts 
			= case base < (size inp) of
				True = let (cp2, newbase, t) = read_token base inp in 
							case t of
								TEndOfLine  = tokens_ (lp+1) 1 newbase [PosToken lp (cp+cp2-base) t:ts]
											= tokens_ lp (cp+newbase-base) newbase [PosToken lp (cp+cp2-base) t:ts]
					 = finalise lp cp ts
				
tokens :: !String -> [Token]
tokens inp = tokens_ 0 [] 
where 
	tokens_ base ts 
			= case base < (size inp) of
				True = let (_, newbase, t) = read_token base inp in tokens_ newbase [t:ts]
					 = reverse ts	

toStringR :: Real -> String
toStringR r = if (r - toReal(entier r) == 0.0) (toString r +++ ".0") (toString r)
 
instance toString Const	
where
	toString (CString str) = "string" //"\"" +++ str +++ "\""
	toString (CChar chr) = "char" // "'" +++ chr +++ "'"	
	toString (CInt int) = toString int
	toString (CReal real) = toStringR real
	toString (CBool True) = "True"
	toString (CBool False) = "False"
					 
instance toString Token	
where
	toString (TIdentifier name) = escape name
	toString (TStrictIdentifier name) = "!" +++ (escape name)
	toString (TComment comment) = "||" +++ comment
	toString TInlineAnnotation = ""	// skipChars
	toString TAssignmentOp = "="
	toString TMacroAssignmentOp = ":=="
	toString TCAFAssignmentOp = "=:"
	toString TLambda = "\\"
	toString TColon = ","
	toString TVerticalBar = "|"
	toString TOpenParenthesis = "("
	toString TCloseParenthesis = ")"
	toString TOpenBracket = "{"
	toString TCloseBracket = "}"
	toString TTypeDef = "::"
	toString (TLit const) = toString const
	toString TSelectKeyword = "select"
	toString TCaseKeyword = "if"
	toString TLetKeyword = "let"
	toString TInKeyword = "in"
	toString TEndOfLine = "E\n"
	toString _ = "M\n"	

escape f | ss f                            
	= "<{" +++ f +++ "}>"
	= f
where 
	ss f = or [is_ss c \\ c <-: f]
    is_ss c = not (isAlphanum c || c == '_' || c == '.')