implementation module UChar import StdClass, StdOverloaded, StdInt, StdBool from StdChar import instance fromChar Int import code from "bsearch." import code from "WCsubst." // Sysnonym type is not quite good here... causes every kind of problems with toInt, fromInt ... :: UChar = UChar !Int :: GeneralCategory = UppercaseLetter // ^ Lu: Letter, Uppercase | LowercaseLetter // ^ Ll: Letter, Lowercase | TitlecaseLetter // ^ Lt: Letter, Titlecase | ModifierLetter // ^ Lm: Letter, Modifier | OtherLetter // ^ Lo: Letter, Other | NonSpacingMark // ^ Mn: Mark, Non-Spacing | SpacingCombiningMark // ^ Mc: Mark, Spacing Combining | EnclosingMark // ^ Me: Mark, Enclosing | DecimalNumber // ^ Nd: Number, Decimal | LetterNumber // ^ Nl: Number, Letter | OtherNumber // ^ No: Number, Other | ConnectorPunctuation // ^ Pc: Punctuation, Connector | DashPunctuation // ^ Pd: Punctuation, Dash | OpenPunctuation // ^ Ps: Punctuation, Open | ClosePunctuation // ^ Pe: Punctuation, Close | InitialQuote // ^ Pi: Punctuation, Initial quote | FinalQuote // ^ Pf: Punctuation, Final quote | OtherPunctuation // ^ Po: Punctuation, Other | MathSymbol // ^ Sm: Symbol, Math | CurrencySymbol // ^ Sc: Symbol, Currency | ModifierSymbol // ^ Sk: Symbol, Modifier | OtherSymbol // ^ So: Symbol, Other | Space // ^ Zs: Separator, Space | LineSeparator // ^ Zl: Separator, Line | ParagraphSeparator // ^ Zp: Separator, Paragraph | Control // ^ Cc: Other, Control | Format // ^ Cf: Other, Format | Surrogate // ^ Cs: Other, Surrogate | PrivateUse // ^ Co: Other, Private Use | NotAssigned // ^ Cn: Other, Not Assigned wgencat :: !Int -> Int wgencat ch = code inline { ccall u_gencat "I:I" } iswalpha :: !Int -> Int iswalpha ch = code inline { ccall u_iswalpha "I:I" } iswalnum :: !Int -> Int iswalnum ch = code inline { ccall u_iswalnum "I:I" } iswcntrl :: !Int -> Int iswcntrl ch = code inline { ccall u_iswcntrl "I:I" } iswspace :: !Int -> Int iswspace ch = code inline { ccall u_iswspace "I:I" } iswprint :: !Int -> Int iswprint ch = code inline { ccall u_iswprint "I:I" } iswlower :: !Int -> Int iswlower ch = code inline { ccall u_iswlower "I:I" } iswupper :: !Int -> Int iswupper ch = code inline { ccall u_iswupper "I:I" } towlower :: !Int -> Int towlower ch = code inline { ccall u_towlower "I:I" } towupper :: !Int -> Int towupper ch = code inline { ccall u_towupper "I:I" } towtitle :: !Int -> Int towtitle ch = code inline { ccall u_towtitle "I:I" } instance fromInt GeneralCategory where fromInt 0 = UppercaseLetter // ^ Lu: Letter, Uppercase fromInt 1 = LowercaseLetter // ^ Ll: Letter, Lowercase fromInt 2 = TitlecaseLetter // ^ Lt: Letter, Titlecase fromInt 3 = ModifierLetter // ^ Lm: Letter, Modifier fromInt 4 = OtherLetter // ^ Lo: Letter, Other fromInt 5 = NonSpacingMark // ^ Mn: Mark, Non-Spacing fromInt 6 = SpacingCombiningMark // ^ Mc: Mark, Spacing Combining fromInt 7 = EnclosingMark // ^ Me: Mark, Enclosing fromInt 8 = DecimalNumber // ^ Nd: Number, Decimal fromInt 9 = LetterNumber // ^ Nl: Number, Letter fromInt 10 = OtherNumber // ^ No: Number, Other fromInt 11 = ConnectorPunctuation // ^ Pc: Punctuation, Connector fromInt 12 = DashPunctuation // ^ Pd: Punctuation, Dash fromInt 13 = OpenPunctuation // ^ Ps: Punctuation, Open fromInt 14 = ClosePunctuation // ^ Pe: Punctuation, Close fromInt 15 = InitialQuote // ^ Pi: Punctuation, Initial quote fromInt 16 = FinalQuote // ^ Pf: Punctuation, Final quote fromInt 17 = OtherPunctuation // ^ Po: Punctuation, Other fromInt 18 = MathSymbol // ^ Sm: Symbol, Math fromInt 19 = CurrencySymbol // ^ Sc: Symbol, Currency fromInt 20 = ModifierSymbol // ^ Sk: Symbol, Modifier fromInt 21 = OtherSymbol // ^ So: Symbol, Other fromInt 22 = Space // ^ Zs: Separator, Space fromInt 23 = LineSeparator // ^ Zl: Separator, Line fromInt 24 = ParagraphSeparator // ^ Zp: Separator, Paragraph fromInt 25 = Control // ^ Cc: Other, Control fromInt 26 = Format // ^ Cf: Other, Format fromInt 27 = Surrogate // ^ Cs: Other, Surrogate fromInt 28 = PrivateUse // ^ Co: Other, Private Use fromInt 29 = NotAssigned // ^ Cn: Other, Not Assigned generalCategory c = fromInt (wgencat (toInt c)) // | Selects alphabetic Unicode characters (lower-case, upper-case and // title-case letters, plus letters of caseless scripts and modifiers letters). // This function is equivalent to 'Data.Char.isAlpha'. isLetter :: UChar -> Bool isLetter c = case generalCategory c of UppercaseLetter -> True LowercaseLetter -> True TitlecaseLetter -> True ModifierLetter -> True OtherLetter -> True _ -> False // | Selects Unicode mark characters, e.g. accents and the like, which // combine with preceding letters. isMark :: UChar -> Bool isMark c = case generalCategory c of NonSpacingMark -> True SpacingCombiningMark -> True EnclosingMark -> True _ -> False // | Selects Unicode numeric characters, including digits from various // scripts, Roman numerals, etc. isNumber :: UChar -> Bool isNumber c = case generalCategory c of DecimalNumber -> True LetterNumber -> True OtherNumber -> True _ -> False // | Selects Unicode punctuation characters, including various kinds // of connectors, brackets and quotes. isPunctuation :: UChar -> Bool isPunctuation c = case generalCategory c of ConnectorPunctuation -> True DashPunctuation -> True OpenPunctuation -> True ClosePunctuation -> True InitialQuote -> True FinalQuote -> True OtherPunctuation -> True _ -> False // | Selects Unicode symbol characters, including mathematical and // currency symbols. isSymbol :: UChar -> Bool isSymbol c = case generalCategory c of MathSymbol -> True CurrencySymbol -> True ModifierSymbol -> True OtherSymbol -> True _ -> False // | Selects Unicode space and separator characters. isSeparator :: UChar -> Bool isSeparator c = case generalCategory c of Space -> True LineSeparator -> True ParagraphSeparator -> True _ -> False // | Selects alphabetic Unicode characters (lower-case, upper-case and // title-case letters, plus letters of caseless scripts and modifiers letters). // This function is equivalent to 'Data.Char.isLetter'. isAlpha :: UChar -> Bool isAlpha (UChar c) = not (iswalpha c == 0) // | Selects alphabetic or numeric digit Unicode characters. // // Note that numeric digits outside the ASCII range are selected by this // function but not by 'isDigit'. Such digits may be part of identifiers // but are not used by the printer and reader to represent numbers. isAlphaNum :: UChar -> Bool isAlphaNum (UChar c) = not (iswalnum c == 0) // | Selects the first 128 characters of the Unicode character set, // corresponding to the ASCII character set. isAscii :: UChar -> Bool isAscii (UChar c) = c < 0x80 // | Selects the first 256 characters of the Unicode character set, // corresponding to the ISO 8859-1 (Latin-1) character set. isLatin1 :: UChar -> Bool isLatin1 (UChar c) = c <= 0xff // | Selects ASCII lower-case letters, // i.e. characters satisfying both 'isAscii' and 'isLower'. isAsciiLower :: UChar -> Bool isAsciiLower (UChar c) = c >= fromChar 'a' && c <= fromChar 'z' // | Selects ASCII upper-case letters, // i.e. characters satisfying both 'isAscii' and 'isUpper'. isAsciiUpper :: UChar -> Bool isAsciiUpper (UChar c) = c >= fromChar 'A' && c <= fromChar 'Z' // | Returns 'True' for any Unicode space character, and the control // characters @\\t@, @\\n@, @\\r@, @\\f@, @\\v@. // isSpace includes non-breaking space // Done with explicit equalities both for efficiency, and to avoid a tiresome // recursion with GHC.List elem isSpace :: UChar -> Bool isSpace (UChar c) = c == fromChar ' ' || c == fromChar '\t' || c == fromChar '\n' || c == fromChar '\r' || c == fromChar '\f' || c == fromChar '\v' || c == 0xa0 || not (iswspace c == 0) // | Selects control characters, which are the non-printing characters of // the Latin-1 subset of Unicode. isControl :: UChar -> Bool isControl (UChar c) = not (iswcntrl c == 0) // | Selects printable Unicode characters // (letters, numbers, marks, punctuation, symbols and spaces). isPrint :: UChar -> Bool isPrint (UChar c) = not (iswprint c == 0) // | Selects upper-case or title-case alphabetic Unicode characters (letters). // Title case is used by a small number of letter ligatures like the // single-character form of /Lj/. isUpper :: UChar -> Bool isUpper (UChar c) = not (iswupper c == 0) // | Selects lower-case alphabetic Unicode characters (letters). isLower :: UChar -> Bool isLower (UChar c) = not (iswlower c == 0) // | Selects ASCII digits, i.e. @\'0\'@..@\'9\'@. isDigit :: UChar -> Bool isDigit (UChar c) = c >= fromChar '0' && c <= fromChar '9' // | Selects ASCII octal digits, i.e. @\'0\'@..@\'7\'@. isOctDigit :: UChar -> Bool isOctDigit (UChar c) = c >= fromChar '0' && c <= fromChar '7' // | Selects ASCII hexadecimal digits, // i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@. isHexDigit :: UChar -> Bool isHexDigit o=:(UChar c) = isDigit o || c >= fromChar 'A' && c <= fromChar 'F' || c >= fromChar 'a' && c <= fromChar 'f' // | Convert a letter to the corresponding lower-case letter, if any. // Any other character is returned unchanged. toLower :: UChar -> UChar toLower (UChar c) = fromInt (towlower c) // | Convert a letter to the corresponding upper-case letter, if any. // Any other character is returned unchanged. toUpper :: UChar -> UChar toUpper (UChar c) = fromInt (towupper c) // | Convert a letter to the corresponding title-case or upper-case // letter, if any. (Title case differs from upper case only for a small // number of ligature letters.) // Any other character is returned unchanged. toTitle :: UChar -> UChar toTitle (UChar c) = fromInt (towtitle c) instance == UChar where (==) (UChar x) (UChar y) = x == y instance < UChar where (<) (UChar x) (UChar y) = x < y // TODO: correct? instance fromInt UChar where fromInt i = UChar i instance fromChar UChar where fromChar c = UChar (toInt c) instance toChar UChar where toChar (UChar c) = fromInt c instance toInt UChar where toInt (UChar c) = c