(index<- ) ./libcore/char.rs
git branch: * master 5200215 auto merge of #14035 : alexcrichton/rust/experimental, r=huonw
modified: Fri May 9 13:02:28 2014
1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 //! Character manipulation (`char` type, Unicode Scalar Value)
12 //!
13 //! This module provides the `Char` trait, as well as its implementation
14 //! for the primitive `char` type, in order to allow basic character manipulation.
15 //!
16 //! A `char` actually represents a
17 //! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
18 //! as it can contain any Unicode code point except high-surrogate and
19 //! low-surrogate code points.
20 //!
21 //! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
22 //! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
23 //! however the converse is not always true due to the above range limits
//! and, as such, should be performed via the `from_u32` function.
25
26
27 use cast::transmute;
28 use option::{None, Option, Some};
29 use iter::{Iterator, range_step};
30 use unicode::{derived_property, property, general_category, decompose, conversions};
31
32 #[cfg(not(test))] use cmp::{Eq, Ord, TotalEq, TotalOrd, Ordering};
33 #[cfg(not(test))] use default::Default;
34
// UTF-8 ranges and tags for encoding characters.
//
// The `TAG_*` values are OR-ed into output bytes by `encode_utf8`; the
// `MAX_*_B` values are exclusive upper bounds on the code point for each
// encoded length.

// Tag OR-ed into every continuation byte (0x80).
static TAG_CONT: uint = 128u;
// Code points below this value are written as a single byte.
static MAX_ONE_B: uint = 128u;
// Tag OR-ed into the first byte of a two-byte sequence (0xC0).
static TAG_TWO_B: uint = 192u;
// Code points below this value are written as at most two bytes.
static MAX_TWO_B: uint = 2048u;
// Tag OR-ed into the first byte of a three-byte sequence (0xE0).
static TAG_THREE_B: uint = 224u;
// Code points below this value are written as at most three bytes.
static MAX_THREE_B: uint = 65536u;
// Tag OR-ed into the first byte of a four-byte sequence (0xF0).
static TAG_FOUR_B: uint = 240u;
43
44 /*
45 Lu Uppercase_Letter an uppercase letter
46 Ll Lowercase_Letter a lowercase letter
47 Lt Titlecase_Letter a digraphic character, with first part uppercase
48 Lm Modifier_Letter a modifier letter
49 Lo Other_Letter other letters, including syllables and ideographs
50 Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
51 Mc Spacing_Mark a spacing combining mark (positive advance width)
52 Me Enclosing_Mark an enclosing combining mark
53 Nd Decimal_Number a decimal digit
54 Nl Letter_Number a letterlike numeric character
55 No Other_Number a numeric character of other type
56 Pc Connector_Punctuation a connecting punctuation mark, like a tie
57 Pd Dash_Punctuation a dash or hyphen punctuation mark
58 Ps Open_Punctuation an opening punctuation mark (of a pair)
59 Pe Close_Punctuation a closing punctuation mark (of a pair)
60 Pi Initial_Punctuation an initial quotation mark
61 Pf Final_Punctuation a final quotation mark
62 Po Other_Punctuation a punctuation mark of other type
63 Sm Math_Symbol a symbol of primarily mathematical use
64 Sc Currency_Symbol a currency sign
65 Sk Modifier_Symbol a non-letterlike modifier symbol
66 So Other_Symbol a symbol of other type
67 Zs Space_Separator a space character (of various non-zero widths)
68 Zl Line_Separator U+2028 LINE SEPARATOR only
69 Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
70 Cc Control a C0 or C1 control code
71 Cf Format a format control character
72 Cs Surrogate a surrogate code point
73 Co Private_Use a private-use character
74 Cn Unassigned a reserved unassigned code point or a noncharacter
75 */
76
/// The highest valid code point
///
/// `from_u32` rejects every value greater than this.
pub static MAX: char = '\U0010ffff';
79
80 /// Converts from `u32` to a `char`
81 #[inline]
82 pub fn from_u32(i: u32) -> Option<char> {
83 // catch out-of-bounds and surrogates
84 if (i > MAX as u32) || (i >= 0xD800 && i <= 0xDFFF) {
85 None
86 } else {
87 Some(unsafe { transmute(i) })
88 }
89 }
90
/// Returns whether the specified `char` is considered a Unicode alphabetic
/// code point
///
/// Thin wrapper over `derived_property::Alphabetic`.
pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) }
94
/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
///
/// 'XID_Start' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to 'ID_Start' but modified for closure under NFKx.
///
/// Thin wrapper over `derived_property::XID_Start`.
pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) }
101
/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
///
/// 'XID_Continue' is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
///
/// Thin wrapper over `derived_property::XID_Continue`.
pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
108
///
/// Indicates whether a `char` is in lower case
///
/// This is defined according to the terms of the Unicode Derived Core Property 'Lowercase'.
/// Thin wrapper over `derived_property::Lowercase`.
///
#[inline]
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
116
///
/// Indicates whether a `char` is in upper case
///
/// This is defined according to the terms of the Unicode Derived Core Property 'Uppercase'.
/// Thin wrapper over `derived_property::Uppercase`.
///
#[inline]
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
124
125 ///
126 /// Indicates whether a `char` is whitespace
127 ///
128 /// Whitespace is defined in terms of the Unicode Property 'White_Space'.
129 ///
130 #[inline]
131 pub fn is_whitespace(c: char) -> bool {
132 // As an optimization ASCII whitespace characters are checked separately
133 c == ' '
134 || ('\x09' <= c && c <= '\x0d')
135 || property::White_Space(c)
136 }
137
138 ///
139 /// Indicates whether a `char` is alphanumeric
140 ///
141 /// Alphanumericness is defined in terms of the Unicode General Categories
142 /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
143 ///
144 #[inline]
145 pub fn is_alphanumeric(c: char) -> bool {
146 derived_property::Alphabetic(c)
147 || general_category::Nd(c)
148 || general_category::Nl(c)
149 || general_category::No(c)
150 }
151
///
/// Indicates whether a `char` is a control code point
///
/// Control code points are defined in terms of the Unicode General Category
/// 'Cc'. Thin wrapper over `general_category::Cc`.
///
#[inline]
pub fn is_control(c: char) -> bool { general_category::Cc(c) }
160
161 /// Indicates whether the `char` is numeric (Nd, Nl, or No)
162 #[inline]
163 pub fn is_digit(c: char) -> bool {
164 general_category::Nd(c)
165 || general_category::Nl(c)
166 || general_category::No(c)
167 }
168
169 ///
170 /// Checks if a `char` parses as a numeric digit in the given radix
171 ///
172 /// Compared to `is_digit()`, this function only recognizes the
173 /// characters `0-9`, `a-z` and `A-Z`.
174 ///
175 /// # Return value
176 ///
177 /// Returns `true` if `c` is a valid digit under `radix`, and `false`
178 /// otherwise.
179 ///
180 /// # Failure
181 ///
182 /// Fails if given a `radix` > 36.
183 ///
184 /// # Note
185 ///
186 /// This just wraps `to_digit()`.
187 ///
188 #[inline]
189 pub fn is_digit_radix(c: char, radix: uint) -> bool {
190 match to_digit(c, radix) {
191 Some(_) => true,
192 None => false,
193 }
194 }
195
196 ///
197 /// Converts a `char` to the corresponding digit
198 ///
199 /// # Return value
200 ///
201 /// If `c` is between '0' and '9', the corresponding value
202 /// between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is
203 /// 'b' or 'B', 11, etc. Returns none if the `char` does not
204 /// refer to a digit in the given radix.
205 ///
206 /// # Failure
207 ///
208 /// Fails if given a `radix` outside the range `[0..36]`.
209 ///
210 #[inline]
211 pub fn to_digit(c: char, radix: uint) -> Option<uint> {
212 if radix > 36 {
213 fail!("to_digit: radix is too high (maximum 36)");
214 }
215 let val = match c {
216 '0' .. '9' => c as uint - ('0' as uint),
217 'a' .. 'z' => c as uint + 10u - ('a' as uint),
218 'A' .. 'Z' => c as uint + 10u - ('A' as uint),
219 _ => return None,
220 };
221 if val < radix { Some(val) }
222 else { None }
223 }
224
/// Convert a char to its uppercase equivalent
///
/// The case-folding performed is the common or simple mapping:
/// it maps one unicode codepoint (one char in Rust) to its uppercase equivalent according
/// to the Unicode database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
/// The additional SpecialCasing.txt is not considered here, as it expands to multiple
/// codepoints in some cases.
///
/// A full reference can be found here
/// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
///
/// This function delegates to `conversions::to_upper`.
///
/// # Return value
///
/// Returns the char itself if no conversion was made
#[inline]
pub fn to_uppercase(c: char) -> char {
    conversions::to_upper(c)
}
243
/// Convert a char to its lowercase equivalent
///
/// The case-folding performed is the common or simple mapping;
/// see `to_uppercase` for references and more information.
///
/// This function delegates to `conversions::to_lower`.
///
/// # Return value
///
/// Returns the char itself if no conversion is possible
#[inline]
pub fn to_lowercase(c: char) -> char {
    conversions::to_lower(c)
}
256
257 ///
258 /// Converts a number to the character representing it
259 ///
260 /// # Return value
261 ///
262 /// Returns `Some(char)` if `num` represents one digit under `radix`,
263 /// using one character of `0-9` or `a-z`, or `None` if it doesn't.
264 ///
265 /// # Failure
266 ///
267 /// Fails if given an `radix` > 36.
268 ///
269 #[inline]
270 pub fn from_digit(num: uint, radix: uint) -> Option<char> {
271 if radix > 36 {
272 fail!("from_digit: radix is to high (maximum 36)");
273 }
274 if num < radix {
275 unsafe {
276 if num < 10 {
277 Some(transmute(('0' as uint + num) as u32))
278 } else {
279 Some(transmute(('a' as uint + num - 10u) as u32))
280 }
281 }
282 } else {
283 None
284 }
285 }
286
// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
//
// Used by `decompose_hangul` and by the range checks in
// `decompose_canonical` / `decompose_compatible`.

// First code point of the block handled by `decompose_hangul`; subtracted
// from a character to obtain its index within the block.
static S_BASE: uint = 0xAC00;
// Base code point of the first character emitted by `decompose_hangul`.
static L_BASE: uint = 0x1100;
// Base code point of the second character emitted by `decompose_hangul`.
static V_BASE: uint = 0x1161;
// Base code point of the optional third character; it is only emitted for
// a non-zero index, so `T_BASE` itself is never produced.
static T_BASE: uint = 0x11A7;
// Number of distinct first / second / third components.
static L_COUNT: uint = 19;
static V_COUNT: uint = 21;
static T_COUNT: uint = 28;
// Number of block entries that share the same first component.
static N_COUNT: uint = (V_COUNT * T_COUNT);
// Total number of entries in the block starting at `S_BASE`.
static S_COUNT: uint = (L_COUNT * N_COUNT);
297
298 // Decompose a precomposed Hangul syllable
299 fn decompose_hangul(s: char, f: |char|) {
300 let si = s as uint - S_BASE;
301
302 let li = si / N_COUNT;
303 unsafe {
304 f(transmute((L_BASE + li) as u32));
305
306 let vi = (si % N_COUNT) / T_COUNT;
307 f(transmute((V_BASE + vi) as u32));
308
309 let ti = si % T_COUNT;
310 if ti > 0 {
311 f(transmute((T_BASE + ti) as u32));
312 }
313 }
314 }
315
316 /// Returns the canonical decomposition of a character
317 pub fn decompose_canonical(c: char, f: |char|) {
318 if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
319 decompose::canonical(c, f);
320 } else {
321 decompose_hangul(c, f);
322 }
323 }
324
325 /// Returns the compatibility decomposition of a character
326 pub fn decompose_compatible(c: char, f: |char|) {
327 if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
328 decompose::compatibility(c, f);
329 } else {
330 decompose_hangul(c, f);
331 }
332 }
333
334 ///
335 /// Returns the hexadecimal Unicode escape of a `char`
336 ///
337 /// The rules are as follows:
338 ///
339 /// - chars in [0,0xff] get 2-digit escapes: `\\xNN`
340 /// - chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
341 /// - chars above 0x10000 get 8-digit escapes: `\\UNNNNNNNN`
342 ///
343 pub fn escape_unicode(c: char, f: |char|) {
344 // avoid calling str::to_str_radix because we don't really need to allocate
345 // here.
346 f('\\');
347 let pad = match () {
348 _ if c <= '\xff' => { f('x'); 2 }
349 _ if c <= '\uffff' => { f('u'); 4 }
350 _ => { f('U'); 8 }
351 };
352 for offset in range_step::<i32>(4 * (pad - 1), -1, -4) {
353 unsafe {
354 match ((c as i32) >> offset) & 0xf {
355 i @ 0 .. 9 => { f(transmute('0' as i32 + i)); }
356 i => { f(transmute('a' as i32 + (i - 10))); }
357 }
358 }
359 }
360 }
361
362 ///
363 /// Returns a 'default' ASCII and C++11-like literal escape of a `char`
364 ///
365 /// The default is chosen with a bias toward producing literals that are
366 /// legal in a variety of languages, including C++11 and similar C-family
367 /// languages. The exact rules are:
368 ///
369 /// - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
370 /// - Single-quote, double-quote and backslash chars are backslash-escaped.
371 /// - Any other chars in the range [0x20,0x7e] are not escaped.
372 /// - Any other chars are given hex unicode escapes; see `escape_unicode`.
373 ///
374 pub fn escape_default(c: char, f: |char|) {
375 match c {
376 '\t' => { f('\\'); f('t'); }
377 '\r' => { f('\\'); f('r'); }
378 '\n' => { f('\\'); f('n'); }
379 '\\' => { f('\\'); f('\\'); }
380 '\'' => { f('\\'); f('\''); }
381 '"' => { f('\\'); f('"'); }
382 '\x20' .. '\x7e' => { f(c); }
383 _ => c.escape_unicode(f),
384 }
385 }
386
387 /// Returns the amount of bytes this `char` would need if encoded in UTF-8
388 pub fn len_utf8_bytes(c: char) -> uint {
389 static MAX_ONE_B: uint = 128u;
390 static MAX_TWO_B: uint = 2048u;
391 static MAX_THREE_B: uint = 65536u;
392 static MAX_FOUR_B: uint = 2097152u;
393
394 let code = c as uint;
395 match () {
396 _ if code < MAX_ONE_B => 1u,
397 _ if code < MAX_TWO_B => 2u,
398 _ if code < MAX_THREE_B => 3u,
399 _ if code < MAX_FOUR_B => 4u,
400 _ => fail!("invalid character!"),
401 }
402 }
403
/// Useful functions for Unicode characters.
///
/// The implementation for `char` in this module forwards most methods to
/// the free function of the same name.
pub trait Char {
    /// Returns whether the specified character is considered a Unicode
    /// alphabetic code point.
    fn is_alphabetic(&self) -> bool;

    /// Returns whether the specified character satisfies the 'XID_Start'
    /// Unicode property.
    ///
    /// 'XID_Start' is a Unicode Derived Property specified in
    /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
    /// mostly similar to 'ID_Start' but modified for closure under NFKx.
    fn is_XID_start(&self) -> bool;

    /// Returns whether the specified `char` satisfies the 'XID_Continue'
    /// Unicode property.
    ///
    /// 'XID_Continue' is a Unicode Derived Property specified in
    /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
    /// mostly similar to 'ID_Continue' but modified for closure under NFKx.
    fn is_XID_continue(&self) -> bool;


    /// Indicates whether a character is in lowercase.
    ///
    /// This is defined according to the terms of the Unicode Derived Core
    /// Property `Lowercase`.
    fn is_lowercase(&self) -> bool;

    /// Indicates whether a character is in uppercase.
    ///
    /// This is defined according to the terms of the Unicode Derived Core
    /// Property `Uppercase`.
    fn is_uppercase(&self) -> bool;

    /// Indicates whether a character is whitespace.
    ///
    /// Whitespace is defined in terms of the Unicode Property `White_Space`.
    fn is_whitespace(&self) -> bool;

    /// Indicates whether a character is alphanumeric.
    ///
    /// Alphanumericness is defined in terms of the Unicode General Categories
    /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
    fn is_alphanumeric(&self) -> bool;

    /// Indicates whether a character is a control code point.
    ///
    /// Control code points are defined in terms of the Unicode General
    /// Category `Cc`.
    fn is_control(&self) -> bool;

    /// Indicates whether the character is numeric (Nd, Nl, or No).
    fn is_digit(&self) -> bool;

    /// Checks if a `char` parses as a numeric digit in the given radix.
    ///
    /// Compared to `is_digit()`, this function only recognizes the characters
    /// `0-9`, `a-z` and `A-Z`.
    ///
    /// # Return value
    ///
    /// Returns `true` if the character is a valid digit under `radix`, and
    /// `false` otherwise.
    ///
    /// # Failure
    ///
    /// Fails if given a radix > 36.
    fn is_digit_radix(&self, radix: uint) -> bool;

    /// Converts a character to the corresponding digit.
    ///
    /// # Return value
    ///
    /// If the character is between '0' and '9', the corresponding value
    /// between 0 and 9. If it is 'a' or 'A', 10. If it is 'b' or 'B', 11,
    /// etc. Returns none if the character does not refer to a digit in the
    /// given radix.
    ///
    /// # Failure
    ///
    /// Fails if given a radix outside the range [0..36].
    fn to_digit(&self, radix: uint) -> Option<uint>;

    /// Converts a character to its lowercase equivalent.
    ///
    /// The case-folding performed is the common or simple mapping. See
    /// `to_uppercase()` for references and more information.
    ///
    /// # Return value
    ///
    /// Returns the lowercase equivalent of the character, or the character
    /// itself if no conversion is possible.
    fn to_lowercase(&self) -> char;

    /// Converts a character to its uppercase equivalent.
    ///
    /// The case-folding performed is the common or simple mapping: it maps
    /// one unicode codepoint (one character in Rust) to its uppercase
    /// equivalent according to the Unicode database [1]. The additional
    /// `SpecialCasing.txt` is not considered here, as it expands to multiple
    /// codepoints in some cases.
    ///
    /// A full reference can be found here [2].
    ///
    /// # Return value
    ///
    /// Returns the uppercase equivalent of the character, or the character
    /// itself if no conversion was made.
    ///
    /// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
    ///
    /// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
    fn to_uppercase(&self) -> char;

    /// Converts a number to the character representing it.
    ///
    /// This is a static method: it takes no `self`.
    ///
    /// # Return value
    ///
    /// Returns `Some(char)` if `num` represents one digit under `radix`,
    /// using one character of `0-9` or `a-z`, or `None` if it doesn't.
    ///
    /// # Failure
    ///
    /// Fails if given a radix > 36.
    fn from_digit(num: uint, radix: uint) -> Option<char>;

    /// Returns the hexadecimal Unicode escape of a character.
    ///
    /// The escape is passed to `f` one character at a time. The rules are
    /// as follows:
    ///
    /// * Characters in [0,0xff] get 2-digit escapes: `\xNN`.
    /// * Characters in [0x100,0xffff] get 4-digit escapes: `\uNNNN`.
    /// * Characters from 0x10000 upwards get 8-digit escapes: `\UNNNNNNNN`.
    fn escape_unicode(&self, f: |char|);

    /// Returns a 'default' ASCII and C++11-like literal escape of a
    /// character.
    ///
    /// The escape is passed to `f` one character at a time. The default is
    /// chosen with a bias toward producing literals that are legal in a
    /// variety of languages, including C++11 and similar C-family
    /// languages. The exact rules are:
    ///
    /// * Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
    /// * Single-quote, double-quote and backslash chars are backslash-
    ///   escaped.
    /// * Any other chars in the range [0x20,0x7e] are not escaped.
    /// * Any other chars are given hex unicode escapes; see `escape_unicode`.
    fn escape_default(&self, f: |char|);

    /// Returns the number of bytes this character would need if encoded in
    /// UTF-8.
    fn len_utf8_bytes(&self) -> uint;

    /// Encodes this character as UTF-8 into the provided byte buffer.
    ///
    /// The buffer must be at least 4 bytes long or a runtime failure may
    /// occur.
    ///
    /// This will then return the number of bytes written to the slice.
    fn encode_utf8(&self, dst: &mut [u8]) -> uint;

    /// Encodes this character as UTF-16 into the provided `u16` buffer.
    ///
    /// The buffer must be at least 2 elements long or a runtime failure may
    /// occur.
    ///
    /// This will then return the number of `u16`s written to the slice.
    fn encode_utf16(&self, dst: &mut [u16]) -> uint;
}
573
574 impl Char for char {
575 fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
576
577 fn is_XID_start(&self) -> bool { is_XID_start(*self) }
578
579 fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
580
581 fn is_lowercase(&self) -> bool { is_lowercase(*self) }
582
583 fn is_uppercase(&self) -> bool { is_uppercase(*self) }
584
585 fn is_whitespace(&self) -> bool { is_whitespace(*self) }
586
587 fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
588
589 fn is_control(&self) -> bool { is_control(*self) }
590
591 fn is_digit(&self) -> bool { is_digit(*self) }
592
593 fn is_digit_radix(&self, radix: uint) -> bool { is_digit_radix(*self, radix) }
594
595 fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }
596
597 fn to_lowercase(&self) -> char { to_lowercase(*self) }
598
599 fn to_uppercase(&self) -> char { to_uppercase(*self) }
600
601 fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }
602
603 fn escape_unicode(&self, f: |char|) { escape_unicode(*self, f) }
604
605 fn escape_default(&self, f: |char|) { escape_default(*self, f) }
606
607 fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
608
609 fn encode_utf8(&self, dst: &mut [u8]) -> uint {
610 let code = *self as uint;
611 if code < MAX_ONE_B {
612 dst[0] = code as u8;
613 return 1;
614 } else if code < MAX_TWO_B {
615 dst[0] = (code >> 6u & 31u | TAG_TWO_B) as u8;
616 dst[1] = (code & 63u | TAG_CONT) as u8;
617 return 2;
618 } else if code < MAX_THREE_B {
619 dst[0] = (code >> 12u & 15u | TAG_THREE_B) as u8;
620 dst[1] = (code >> 6u & 63u | TAG_CONT) as u8;
621 dst[2] = (code & 63u | TAG_CONT) as u8;
622 return 3;
623 } else {
624 dst[0] = (code >> 18u & 7u | TAG_FOUR_B) as u8;
625 dst[1] = (code >> 12u & 63u | TAG_CONT) as u8;
626 dst[2] = (code >> 6u & 63u | TAG_CONT) as u8;
627 dst[3] = (code & 63u | TAG_CONT) as u8;
628 return 4;
629 }
630 }
631
632 fn encode_utf16(&self, dst: &mut [u16]) -> uint {
633 let mut ch = *self as uint;
634 if (ch & 0xFFFF_u) == ch {
635 // The BMP falls through (assuming non-surrogate, as it
636 // should)
637 assert!(ch <= 0xD7FF_u || ch >= 0xE000_u);
638 dst[0] = ch as u16;
639 1
640 } else {
641 // Supplementary planes break into surrogates.
642 assert!(ch >= 0x1_0000_u && ch <= 0x10_FFFF_u);
643 ch -= 0x1_0000_u;
644 dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
645 dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
646 2
647 }
648 }
649 }
650
// Equality for `char`: compares the two values directly.
// NOTE(review): compiled out under `cfg(test)`, presumably because the test
// build relies on the impls from `realcore` — confirm.
#[cfg(not(test))]
impl Eq for char {
    #[inline]
    fn eq(&self, other: &char) -> bool { (*self) == (*other) }
}
656
// Marks `char` as `TotalEq`; the impl adds no methods of its own.
#[cfg(not(test))]
impl TotalEq for char {}
659
// Ordering for `char`: `lt` compares the two values directly.
#[cfg(not(test))]
impl Ord for char {
    #[inline]
    fn lt(&self, other: &char) -> bool { *self < *other }
}
665
// Total ordering for `char`: both sides are cast to `u32` and compared
// with `u32`'s `cmp`.
#[cfg(not(test))]
impl TotalOrd for char {
    fn cmp(&self, other: &char) -> Ordering {
        (*self as u32).cmp(&(*other as u32))
    }
}
672
// The default `char` is the NUL character, `'\x00'`.
#[cfg(not(test))]
impl Default for char {
    #[inline]
    fn default() -> char { '\x00' }
}
678
679 #[cfg(test)]
680 mod test {
681 use super::{escape_unicode, escape_default};
682
683 use realcore::char::Char;
684 use slice::ImmutableVector;
685 use realstd::option::{Some, None};
686 use realstd::strbuf::StrBuf;
687 use realstd::str::StrAllocating;
688
689 #[test]
690 fn test_is_lowercase() {
691 assert!('a'.is_lowercase());
692 assert!('ö'.is_lowercase());
693 assert!('Ã'.is_lowercase());
694 assert!(!'Ã'.is_lowercase());
695 assert!(!'P'.is_lowercase());
696 }
697
698 #[test]
699 fn test_is_uppercase() {
700 assert!(!'h'.is_uppercase());
701 assert!(!'ä'.is_uppercase());
702 assert!(!'Ã'.is_uppercase());
703 assert!('Ã'.is_uppercase());
704 assert!('T'.is_uppercase());
705 }
706
707 #[test]
708 fn test_is_whitespace() {
709 assert!(' '.is_whitespace());
710 assert!('\u2007'.is_whitespace());
711 assert!('\t'.is_whitespace());
712 assert!('\n'.is_whitespace());
713 assert!(!'a'.is_whitespace());
714 assert!(!'_'.is_whitespace());
715 assert!(!'\u0000'.is_whitespace());
716 }
717
718 #[test]
719 fn test_to_digit() {
720 assert_eq!('0'.to_digit(10u), Some(0u));
721 assert_eq!('1'.to_digit(2u), Some(1u));
722 assert_eq!('2'.to_digit(3u), Some(2u));
723 assert_eq!('9'.to_digit(10u), Some(9u));
724 assert_eq!('a'.to_digit(16u), Some(10u));
725 assert_eq!('A'.to_digit(16u), Some(10u));
726 assert_eq!('b'.to_digit(16u), Some(11u));
727 assert_eq!('B'.to_digit(16u), Some(11u));
728 assert_eq!('z'.to_digit(36u), Some(35u));
729 assert_eq!('Z'.to_digit(36u), Some(35u));
730 assert_eq!(' '.to_digit(10u), None);
731 assert_eq!('$'.to_digit(36u), None);
732 }
733
734 #[test]
735 fn test_to_lowercase() {
736 assert_eq!('A'.to_lowercase(), 'a');
737 assert_eq!('Ã'.to_lowercase(), 'ö');
738 assert_eq!('Ã'.to_lowercase(), 'Ã');
739 assert_eq!('Ã'.to_lowercase(), 'ü');
740 assert_eq!('ð©'.to_lowercase(), 'ð©');
741 assert_eq!('Σ'.to_lowercase(), 'Ï');
742 assert_eq!('Τ'.to_lowercase(), 'Ï');
743 assert_eq!('Î'.to_lowercase(), 'ι');
744 assert_eq!('Î'.to_lowercase(), 'γ');
745 assert_eq!('Î'.to_lowercase(), 'μ');
746 assert_eq!('Î'.to_lowercase(), 'α');
747 assert_eq!('Σ'.to_lowercase(), 'Ï');
748 }
749
750 #[test]
751 fn test_to_uppercase() {
752 assert_eq!('a'.to_uppercase(), 'A');
753 assert_eq!('ö'.to_uppercase(), 'Ã');
754 assert_eq!('Ã'.to_uppercase(), 'Ã'); // not áº: Latin capital letter sharp s
755 assert_eq!('ü'.to_uppercase(), 'Ã');
756 assert_eq!('ð©'.to_uppercase(), 'ð©');
757
758 assert_eq!('Ï'.to_uppercase(), 'Σ');
759 assert_eq!('Ï'.to_uppercase(), 'Τ');
760 assert_eq!('ι'.to_uppercase(), 'Î');
761 assert_eq!('γ'.to_uppercase(), 'Î');
762 assert_eq!('μ'.to_uppercase(), 'Î');
763 assert_eq!('α'.to_uppercase(), 'Î');
764 assert_eq!('Ï'.to_uppercase(), 'Σ');
765 }
766
767 #[test]
768 fn test_is_control() {
769 assert!('\u0000'.is_control());
770 assert!('\u0003'.is_control());
771 assert!('\u0006'.is_control());
772 assert!('\u0009'.is_control());
773 assert!('\u007f'.is_control());
774 assert!('\u0092'.is_control());
775 assert!(!'\u0020'.is_control());
776 assert!(!'\u0055'.is_control());
777 assert!(!'\u0068'.is_control());
778 }
779
780 #[test]
781 fn test_is_digit() {
782 assert!('2'.is_digit());
783 assert!('7'.is_digit());
784 assert!(!'c'.is_digit());
785 assert!(!'i'.is_digit());
786 assert!(!'z'.is_digit());
787 assert!(!'Q'.is_digit());
788 }
789
790 #[test]
791 fn test_escape_default() {
792 fn string(c: char) -> ~str {
793 let mut result = StrBuf::new();
794 escape_default(c, |c| { result.push_char(c); });
795 return result.into_owned();
796 }
797 assert_eq!(string('\n'), "\\n".to_owned());
798 assert_eq!(string('\r'), "\\r".to_owned());
799 assert_eq!(string('\''), "\\'".to_owned());
800 assert_eq!(string('"'), "\\\"".to_owned());
801 assert_eq!(string(' '), " ".to_owned());
802 assert_eq!(string('a'), "a".to_owned());
803 assert_eq!(string('~'), "~".to_owned());
804 assert_eq!(string('\x00'), "\\x00".to_owned());
805 assert_eq!(string('\x1f'), "\\x1f".to_owned());
806 assert_eq!(string('\x7f'), "\\x7f".to_owned());
807 assert_eq!(string('\xff'), "\\xff".to_owned());
808 assert_eq!(string('\u011b'), "\\u011b".to_owned());
809 assert_eq!(string('\U0001d4b6'), "\\U0001d4b6".to_owned());
810 }
811
812 #[test]
813 fn test_escape_unicode() {
814 fn string(c: char) -> ~str {
815 let mut result = StrBuf::new();
816 escape_unicode(c, |c| { result.push_char(c); });
817 return result.into_owned();
818 }
819 assert_eq!(string('\x00'), "\\x00".to_owned());
820 assert_eq!(string('\n'), "\\x0a".to_owned());
821 assert_eq!(string(' '), "\\x20".to_owned());
822 assert_eq!(string('a'), "\\x61".to_owned());
823 assert_eq!(string('\u011b'), "\\u011b".to_owned());
824 assert_eq!(string('\U0001d4b6'), "\\U0001d4b6".to_owned());
825 }
826
827 #[test]
828 fn test_to_str() {
829 use realstd::to_str::ToStr;
830 let s = 't'.to_str();
831 assert_eq!(s, "t".to_owned());
832 }
833
834 #[test]
835 fn test_encode_utf8() {
836 fn check(input: char, expect: &[u8]) {
837 let mut buf = [0u8, ..4];
838 let n = input.encode_utf8(buf /* as mut slice! */);
839 assert_eq!(buf.slice_to(n), expect);
840 }
841
842 check('x', [0x78]);
843 check('\u00e9', [0xc3, 0xa9]);
844 check('\ua66e', [0xea, 0x99, 0xae]);
845 check('\U0001f4a9', [0xf0, 0x9f, 0x92, 0xa9]);
846 }
847
848 #[test]
849 fn test_encode_utf16() {
850 fn check(input: char, expect: &[u16]) {
851 let mut buf = [0u16, ..2];
852 let n = input.encode_utf16(buf /* as mut slice! */);
853 assert_eq!(buf.slice_to(n), expect);
854 }
855
856 check('x', [0x0078]);
857 check('\u00e9', [0x00e9]);
858 check('\ua66e', [0xa66e]);
859 check('\U0001f4a9', [0xd83d, 0xdca9]);
860 }
861 }