(index<- )        ./libcore/char.rs

    git branch:    * master           5200215 auto merge of #14035 : alexcrichton/rust/experimental, r=huonw
    modified:    Fri May  9 13:02:28 2014

   1  // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
   2  // file at the top-level directory of this distribution and at
   3  // http://rust-lang.org/COPYRIGHT.
   4  //
   5  // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6  // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7  // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8  // option. This file may not be copied, modified, or distributed
   9  // except according to those terms.
  10
  11  //! Character manipulation (`char` type, Unicode Scalar Value)
  12  //!
  13  //! This module provides the `Char` trait, as well as its implementation
  14  //! for the primitive `char` type, in order to allow basic character manipulation.
  15  //!
  16  //! A `char` actually represents a
  17  //! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
  18  //! as it can contain any Unicode code point except high-surrogate and
  19  //! low-surrogate code points.
  20  //!
  21  //! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
  22  //! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
  23  //! however the converse is not always true due to the above range limits
  24  //! and, as such, should be performed via the `from_u32` function.
  25
  26
  27  use cast::transmute;
  28  use option::{None, Option, Some};
  29  use iter::{Iterator, range_step};
  30  use unicode::{derived_property, property, general_category, decompose, conversions};
  31
  32  #[cfg(not(test))] use cmp::{Eq, Ord, TotalEq, TotalOrd, Ordering};
  33  #[cfg(not(test))] use default::Default;
  34
  35  // UTF-8 ranges and tags for encoding characters (used by `encode_utf8`)
  36  static TAG_CONT: uint = 128u;       // 0b1000_0000, OR-ed into every continuation byte
  37  static MAX_ONE_B: uint = 128u;      // code points below this are written as one byte
  38  static TAG_TWO_B: uint = 192u;      // 0b1100_0000, OR-ed into the first of two bytes
  39  static MAX_TWO_B: uint = 2048u;     // code points below this are written as two bytes
  40  static TAG_THREE_B: uint = 224u;    // 0b1110_0000, OR-ed into the first of three bytes
  41  static MAX_THREE_B: uint = 65536u;  // code points below this are written as three bytes
  42  static TAG_FOUR_B: uint = 240u;     // 0b1111_0000, OR-ed into the first of four bytes
  43
  44  /*
  45      Lu  Uppercase_Letter        an uppercase letter
  46      Ll  Lowercase_Letter        a lowercase letter
  47      Lt  Titlecase_Letter        a digraphic character, with first part uppercase
  48      Lm  Modifier_Letter         a modifier letter
  49      Lo  Other_Letter            other letters, including syllables and ideographs
  50      Mn  Nonspacing_Mark         a nonspacing combining mark (zero advance width)
  51      Mc  Spacing_Mark            a spacing combining mark (positive advance width)
  52      Me  Enclosing_Mark          an enclosing combining mark
  53      Nd  Decimal_Number          a decimal digit
  54      Nl  Letter_Number           a letterlike numeric character
  55      No  Other_Number            a numeric character of other type
  56      Pc  Connector_Punctuation   a connecting punctuation mark, like a tie
  57      Pd  Dash_Punctuation        a dash or hyphen punctuation mark
  58      Ps  Open_Punctuation        an opening punctuation mark (of a pair)
  59      Pe  Close_Punctuation       a closing punctuation mark (of a pair)
  60      Pi  Initial_Punctuation     an initial quotation mark
  61      Pf  Final_Punctuation       a final quotation mark
  62      Po  Other_Punctuation       a punctuation mark of other type
  63      Sm  Math_Symbol             a symbol of primarily mathematical use
  64      Sc  Currency_Symbol         a currency sign
  65      Sk  Modifier_Symbol         a non-letterlike modifier symbol
  66      So  Other_Symbol            a symbol of other type
  67      Zs  Space_Separator         a space character (of various non-zero widths)
  68      Zl  Line_Separator          U+2028 LINE SEPARATOR only
  69      Zp  Paragraph_Separator     U+2029 PARAGRAPH SEPARATOR only
  70      Cc  Control                 a C0 or C1 control code
  71      Cf  Format                  a format control character
  72      Cs  Surrogate               a surrogate code point
  73      Co  Private_Use             a private-use character
  74      Cn  Unassigned              a reserved unassigned code point or a noncharacter
  75  */
  76
  77  /// The highest valid code point (U+10FFFF); `from_u32` rejects anything above it
  78  pub static MAX: char = '\U0010ffff';
  79
  80  /// Converts a `u32` to a `char`; `None` if it is a surrogate or above `MAX`
  81  #[inline]
  82  pub fn from_u32(i: u32) -> Option<char> {
  83      // only in-range, non-surrogate values may be reinterpreted as a `char`
  84      if (i <= MAX as u32) && !(0xD800 <= i && i <= 0xDFFF) {
  85          Some(unsafe { transmute(i) })
  86      } else {
  87          None
  88      }
  89  }
  90
  91  /// Returns whether the specified `char` is considered a Unicode alphabetic
  92  /// code point, as reported by `derived_property::Alphabetic`
  93  pub fn is_alphabetic(c: char) -> bool   { derived_property::Alphabetic(c) }
  94
  95  /// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
  96  ///
  97  /// 'XID_Start' is a Unicode Derived Property specified in
  98  /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
  99  /// mostly similar to 'ID_Start' but modified for closure under NFKx.
100  pub fn is_XID_start(c: char) -> bool    { derived_property::XID_Start(c) }
101
102  /// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
103  ///
104  /// 'XID_Continue' is a Unicode Derived Property specified in
105  /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications), mostly similar
106  /// to 'ID_Continue' but modified for closure under NFKx; see also `is_XID_start`.
107  pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
108
109  /// Indicates whether a `char` is in lower case
110  ///
111  /// This is defined according to the terms of the Unicode Derived Core
112  /// Property 'Lowercase'; a `char` without that property (for example an
113  /// uppercase letter such as 'P') yields `false`.
114  #[inline]
115  pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
116
117  /// Indicates whether a `char` is in upper case
118  ///
119  /// This is defined according to the terms of the Unicode Derived Core
120  /// Property 'Uppercase'; a `char` without that property (for example a
121  /// lowercase letter such as 'h') yields `false`.
122  #[inline]
123  pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
124
125  /// Indicates whether a `char` is whitespace
126  ///
127  /// Whitespace is defined in terms of the Unicode Property 'White_Space'.
128  /// The space character and the range U+0009 through U+000D are answered
129  /// directly; every other `char` is looked up via `property::White_Space`.
130  #[inline]
131  pub fn is_whitespace(c: char) -> bool {
132      // As an optimization ASCII whitespace characters are checked separately
133      c == ' '
134          || ('\x09' <= c && c <= '\x0d')  // U+0009 through U+000D, inclusive
135          || property::White_Space(c)      // reached only when both checks above fail
136  }
137
138  /// Indicates whether a `char` is alphanumeric
139  ///
140  /// Alphanumericness is defined in terms of the Unicode General Categories
141  /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic': a `char`
142  /// qualifies if it is alphabetic or belongs to any of the three categories
143  /// (the latter being the same test that `is_digit` performs).
144  #[inline]
145  pub fn is_alphanumeric(c: char) -> bool {
146      derived_property::Alphabetic(c)
147          || general_category::Nd(c)
148          || general_category::Nl(c)  // Letter_Number
149          || general_category::No(c)  // Other_Number
150  }
151
152  ///
153  /// Indicates whether a `char` is a control code point
154  ///
155  /// Control code points are defined in terms of the Unicode General Category
156  /// 'Cc', i.e. the C0 and C1 control codes.
157  ///
158  #[inline]
159  pub fn is_control(c: char) -> bool { general_category::Cc(c) }
160
161  /// Indicates whether the `char` is numeric (general category Nd, Nl, or No)
162  #[inline]
163  pub fn is_digit(c: char) -> bool {
164      if general_category::Nd(c) { return true; }
165      if general_category::Nl(c) { return true; }
166      general_category::No(c)
167  }
168
169  /// Checks if a `char` parses as a numeric digit in the given radix
170  ///
171  /// Compared to `is_digit()`, this function only recognizes the
172  /// characters `0-9`, `a-z` and `A-Z`; letters are accepted in either
173  /// case and stand for the values 10 through 35.
174  ///
175  /// # Return value
176  ///
177  /// Returns `true` if `c` is a valid digit under `radix`, and `false`
178  /// otherwise.
179  ///
180  /// # Failure
181  ///
182  /// Fails if given a `radix` > 36.
183  ///
184  /// # Note
185  ///
186  /// This is exactly `to_digit(c, radix)` with the digit value discarded.
187  ///
188  #[inline]
189  pub fn is_digit_radix(c: char, radix: uint) -> bool {
190      // `to_digit` performs both the radix check (failing when it is above
191      // 36) and the digit lookup, so only the presence of a result matters
192      // here.
193      to_digit(c, radix).is_some()
194  }
195
196  /// Converts a `char` to the corresponding digit
197  ///
198  /// # Return value
199  ///
200  /// If `c` is between '0' and '9', the corresponding value
201  /// between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is
202  /// 'b' or 'B', 11, and so on up to 'z' or 'Z', 35. Returns `None`
203  /// if that value is not below `radix`, or if `c` is none of
204  /// these characters.
205  ///
206  /// # Failure
207  ///
208  /// Fails if given a `radix` greater than 36.
209  ///
210  #[inline]
211  pub fn to_digit(c: char, radix: uint) -> Option<uint> {
212      if radix > 36 {
213          fail!("to_digit: radix is too high (maximum 36)");
214      }
215      let code = c as uint;
216      let digit = match c {
217          '0' .. '9' => code - ('0' as uint),
218          'a' .. 'z' => code - ('a' as uint) + 10u,
219          'A' .. 'Z' => code - ('A' as uint) + 10u,
220          _ => return None,
221      };
222      if digit < radix { Some(digit) } else { None }
223  }
224
225  /// Convert a char to its uppercase equivalent
226  ///
227  /// The case-folding performed is the common or simple mapping:
228  /// it maps one unicode codepoint (one char in Rust) to its uppercase equivalent according
229  /// to the Unicode database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt.
230  /// The additional SpecialCasing.txt is not considered here, as it expands to multiple
231  /// codepoints in some cases.
232  ///
233  /// A full reference can be found at
234  /// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
235  ///
236  /// # Return value
237  ///
238  /// Returns the uppercase equivalent of `c`, or `c` itself if no conversion was made
239  #[inline]
240  pub fn to_uppercase(c: char) -> char {
241      conversions::to_upper(c)
242  }
243
244  /// Convert a char to its lowercase equivalent
245  ///
246  /// The case-folding performed is the common or simple mapping;
247  /// see `to_uppercase` for references and more information.
248  ///
249  /// # Return value
250  ///
251  /// Returns the lowercase equivalent of `c`, or `c` itself if no conversion is possible
252  #[inline]
253  pub fn to_lowercase(c: char) -> char {
254      conversions::to_lower(c)
255  }
256
257  /// Converts a number to the character representing it
258  ///
259  /// # Return value
260  ///
261  /// Returns `Some(char)` if `num` represents one digit under `radix`,
262  /// using one character of `0-9` or `a-z`, or `None` if it doesn't
263  /// (that is, if `num >= radix`).
264  ///
265  /// # Failure
266  ///
267  /// Fails if given a `radix` > 36.
268  ///
269  #[inline]
270  pub fn from_digit(num: uint, radix: uint) -> Option<char> {
271      if radix > 36 {
272          fail!("from_digit: radix is too high (maximum 36)");
273      }
274      if num >= radix {
275          return None;
276      }
277      // Here num < radix <= 36, so the sum below is the code of an ASCII
278      // digit or lowercase letter and therefore a valid `char`.
279      let code = if num < 10 {
280          '0' as uint + num
281      } else {
282          'a' as uint + num - 10u
283      };
284      Some(unsafe { transmute(code as u32) })
285  }
286
287  // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
288  static S_BASE: uint = 0xAC00;  // first precomposed Hangul syllable
289  static L_BASE: uint = 0x1100;  // base of the leading jamo (first char emitted)
290  static V_BASE: uint = 0x1161;  // base of the vowel jamo (second char emitted)
291  static T_BASE: uint = 0x11A7;  // base of the trailing jamo; index 0 means "none"
292  static L_COUNT: uint = 19;     // number of leading jamo
293  static V_COUNT: uint = 21;     // number of vowel jamo
294  static T_COUNT: uint = 28;     // number of trailing indices, including "none"
295  static N_COUNT: uint = (V_COUNT * T_COUNT);  // syllables per leading jamo
296  static S_COUNT: uint = (L_COUNT * N_COUNT);  // total precomposed syllables
297
298  // Decompose a precomposed Hangul syllable
299  fn decompose_hangul(s: char, f: |char|) {
300      let si = s as uint - S_BASE;
301      // Split the syllable index into its leading, vowel and trailing parts
302      let li = si / N_COUNT;
303      let vi = (si % N_COUNT) / T_COUNT;
304      let ti = si % T_COUNT;
305
306      unsafe {
307          f(transmute((L_BASE + li) as u32));
308          f(transmute((V_BASE + vi) as u32));
309          // a trailing index of zero means there is no third jamo to emit
310          if ti != 0 {
311              f(transmute((T_BASE + ti) as u32));
312          }
313      }
314  }
315
316  /// Passes the canonical decomposition of `c` to `f` (Hangul syllables are split arithmetically)
317  pub fn decompose_canonical(c: char, f: |char|) {
318      if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
319          decompose::canonical(c, f); // outside the precomposed Hangul syllable range
320      } else {
321          decompose_hangul(c, f);
322      }
323  }
324
325  /// Passes the compatibility decomposition of `c` to `f` (Hangul syllables are split arithmetically)
326  pub fn decompose_compatible(c: char, f: |char|) {
327      if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
328          decompose::compatibility(c, f); // outside the precomposed Hangul syllable range
329      } else {
330          decompose_hangul(c, f);
331      }
332  }
333
334  /// Feeds the hexadecimal Unicode escape of a `char` to `f`, one `char` at a time
335  ///
336  /// The rules are as follows:
337  ///
338  /// - chars in [0,0xff] get 2-digit escapes: `\\xNN`
339  /// - chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
340  /// - chars in [0x10000,0x10ffff] get 8-digit escapes: `\\UNNNNNNNN`
341  ///
342  /// Hex digits above 9 are written in lower case.
343  pub fn escape_unicode(c: char, f: |char|) {
344      // Nibble-by-nibble output avoids the allocation of str::to_str_radix
345      f('\\');
346      let (marker, digits) = if c <= '\xff' {
347          ('x', 2)
348      } else if c <= '\uffff' {
349          ('u', 4)
350      } else {
351          ('U', 8)
352      };
353      f(marker);
354      let code = c as i32;
355      for shift in range_step::<i32>(4 * (digits - 1), -1, -4) {
356          let nibble = (code >> shift) & 0xf;
357          let base = if nibble < 10 { '0' as i32 } else { 'a' as i32 - 10 };
358          unsafe { f(transmute(base + nibble)); }
359      }
360  }
361
362  /// Feeds a 'default' ASCII and C++11-like literal escape of a `char` to `f`
363  ///
364  /// The default is chosen with a bias toward producing literals that are
365  /// legal in a variety of languages, including C++11 and similar C-family
366  /// languages. The exact rules are:
367  ///
368  /// - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
369  /// - Single-quote, double-quote and backslash chars are backslash-escaped.
370  /// - Any other chars in the range [0x20,0x7e] are not escaped.
371  /// - Any other chars are given hex unicode escapes; see `escape_unicode`.
372  ///
373  /// Each `char` of the result is passed to `f` in order.
374  pub fn escape_default(c: char, f: |char|) {
375      let escaped = match c {
376          '\t' => 't',
377          '\r' => 'r',
378          '\n' => 'n',
379          '\\' | '\'' | '"' => c,
380          '\x20' .. '\x7e' => return f(c),
381          _ => return escape_unicode(c, f),
382      };
383      f('\\');
384      f(escaped);
385  }
386
387  /// Returns the amount of bytes this `char` would need if encoded in UTF-8
388  pub fn len_utf8_bytes(c: char) -> uint {
389      static MAX_FOUR_B: uint = 2097152u; // the smaller limits are the module-level statics
390      let code = c as uint;
391      if code < MAX_ONE_B {
392          1u
393      } else if code < MAX_TWO_B {
394          2u
395      } else if code < MAX_THREE_B {
396          3u
397      } else if code < MAX_FOUR_B {
398          4u
399      } else {
400          fail!("invalid character!")
401      }
402  }
403
404  /// Useful functions for Unicode characters.
405  pub trait Char {
406      /// Returns whether the specified character is considered a Unicode
407      /// alphabetic code point.
408      fn is_alphabetic(&self) -> bool;
409
410      /// Returns whether the specified character satisfies the 'XID_Start'
411      /// Unicode property.
412      ///
413      /// 'XID_Start' is a Unicode Derived Property specified in
414      /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
415      /// mostly similar to 'ID_Start' but modified for closure under NFKx.
416      fn is_XID_start(&self) -> bool;
417
418      /// Returns whether the specified `char` satisfies the 'XID_Continue'
419      /// Unicode property.
420      ///
421      /// 'XID_Continue' is a Unicode Derived Property specified in
422      /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
423      /// mostly similar to 'ID_Continue' but modified for closure under NFKx.
424      fn is_XID_continue(&self) -> bool;
425
426
427      /// Indicates whether a character is in lowercase.
428      ///
429      /// This is defined according to the terms of the Unicode Derived Core
430      /// Property `Lowercase`.
431      fn is_lowercase(&self) -> bool;
432
433      /// Indicates whether a character is in uppercase.
434      ///
435      /// This is defined according to the terms of the Unicode Derived Core
436      /// Property `Uppercase`.
437      fn is_uppercase(&self) -> bool;
438
439      /// Indicates whether a character is whitespace.
440      ///
441      /// Whitespace is defined in terms of the Unicode Property `White_Space`.
442      fn is_whitespace(&self) -> bool;
443
444      /// Indicates whether a character is alphanumeric.
445      ///
446      /// Alphanumericness is defined in terms of the Unicode General Categories
447      /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
448      fn is_alphanumeric(&self) -> bool;
449
450      /// Indicates whether a character is a control code point.
451      ///
452      /// Control code points are defined in terms of the Unicode General
453      /// Category `Cc`.
454      fn is_control(&self) -> bool;
455
456      /// Indicates whether the character is numeric (Nd, Nl, or No).
457      fn is_digit(&self) -> bool;
458
459      /// Checks if a `char` parses as a numeric digit in the given radix.
460      ///
461      /// Compared to `is_digit()`, this function only recognizes the characters
462      /// `0-9`, `a-z` and `A-Z`.
463      ///
464      /// # Return value
465      ///
466      /// Returns `true` if `c` is a valid digit under `radix`, and `false`
467      /// otherwise.
468      ///
469      /// # Failure
470      ///
471      /// Fails if given a radix > 36.
472      fn is_digit_radix(&self, radix: uint) -> bool;
473
474      /// Converts a character to the corresponding digit.
475      ///
476      /// # Return value
477      ///
478      /// If `c` is between '0' and '9', the corresponding value between 0 and
479      /// 9. If `c` is 'a' or 'A', 10. If `c` is 'b' or 'B', 11, etc. Returns
480      /// `None` if the character does not refer to a digit in the given radix.
481      ///
482      /// # Failure
483      ///
484      /// Fails if given a radix > 36.
485      fn to_digit(&self, radix: uint) -> Option<uint>;
486
487      /// Converts a character to its lowercase equivalent.
488      ///
489      /// The case-folding performed is the common or simple mapping. See
490      /// `to_uppercase()` for references and more information.
491      ///
492      /// # Return value
493      ///
494      /// Returns the lowercase equivalent of the character, or the character
495      /// itself if no conversion is possible.
496      fn to_lowercase(&self) -> char;
497
498      /// Converts a character to its uppercase equivalent.
499      ///
500      /// The case-folding performed is the common or simple mapping: it maps
501      /// one unicode codepoint (one character in Rust) to its uppercase
502      /// equivalent according to the Unicode database [1]. The additional
503      /// `SpecialCasing.txt` is not considered here, as it expands to multiple
504      /// codepoints in some cases.
505      ///
506      /// A full reference can be found here [2].
507      ///
508      /// # Return value
509      ///
510      /// Returns the uppercase equivalent of the character, or the character
511      /// itself if no conversion was made.
512      ///
513      /// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
514      ///
515      /// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
516      fn to_uppercase(&self) -> char;
517
518      /// Converts a number to the character representing it.
519      ///
520      /// # Return value
521      ///
522      /// Returns `Some(char)` if `num` represents one digit under `radix`,
523      /// using one character of `0-9` or `a-z`, or `None` if it doesn't.
524      ///
525      /// # Failure
526      ///
527      /// Fails if given a radix > 36.
528      fn from_digit(num: uint, radix: uint) -> Option<char>;
529
530      /// Passes the hexadecimal Unicode escape of a character to `f`.
531      ///
532      /// The rules are as follows:
533      ///
534      /// * Characters in [0,0xff] get 2-digit escapes: `\\xNN`.
535      /// * Characters in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`.
536      /// * Characters in [0x10000,0x10ffff] get 8-digit escapes: `\\UNNNNNNNN`.
537      fn escape_unicode(&self, f: |char|);
538
539      /// Passes a 'default' ASCII and C++11-like literal escape of a
540      /// character to `f`.
541      ///
542      /// The default is chosen with a bias toward producing literals that are
543      /// legal in a variety of languages, including C++11 and similar C-family
544      /// languages. The exact rules are:
545      ///
546      /// * Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
547      /// * Single-quote, double-quote and backslash chars are backslash-
548      ///   escaped.
549      /// * Any other chars in the range [0x20,0x7e] are not escaped.
550      /// * Any other chars are given hex unicode escapes; see `escape_unicode`.
551      fn escape_default(&self, f: |char|);
552
553      /// Returns the amount of bytes this character would need if encoded in
554      /// UTF-8.
555      fn len_utf8_bytes(&self) -> uint;
556
557      /// Encodes this character as UTF-8 into the provided byte buffer.
558      ///
559      /// The buffer must be at least 4 bytes long or a runtime failure may
560      /// occur.
561      ///
562      /// This will then return the number of bytes written to the slice.
563      fn encode_utf8(&self, dst: &mut [u8]) -> uint;
564
565      /// Encodes this character as UTF-16 into the provided `u16` buffer.
566      ///
567      /// The buffer must be at least 2 elements long or a runtime failure may
568      /// occur.
569      ///
570      /// This will then return the number of `u16`s written to the slice.
571      fn encode_utf16(&self, dst: &mut [u16]) -> uint;
572  }
573
574  impl Char for char {
575      fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
576
577      fn is_XID_start(&self) -> bool { is_XID_start(*self) }
578
579      fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
580
581      fn is_lowercase(&self) -> bool { is_lowercase(*self) }
582
583      fn is_uppercase(&self) -> bool { is_uppercase(*self) }
584
585      fn is_whitespace(&self) -> bool { is_whitespace(*self) }
586
587      fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
588
589      fn is_control(&self) -> bool { is_control(*self) }
590
591      fn is_digit(&self) -> bool { is_digit(*self) }
592
593      fn is_digit_radix(&self, radix: uint) -> bool { is_digit_radix(*self, radix) }
594
595      fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }
596
597      fn to_lowercase(&self) -> char { to_lowercase(*self) }
598
599      fn to_uppercase(&self) -> char { to_uppercase(*self) }
600
601      fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }
602
603      fn escape_unicode(&self, f: |char|) { escape_unicode(*self, f) }
604
605      fn escape_default(&self, f: |char|) { escape_default(*self, f) }
606
607      fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
608
609      fn encode_utf8(&self, dst: &mut [u8]) -> uint {
610          let cp = *self as uint;
611          if cp < MAX_ONE_B {
612              dst[0] = cp as u8;
613              1
614          } else if cp < MAX_TWO_B {
615              dst[0] = (((cp >> 6u) & 0x1F_u) | TAG_TWO_B) as u8;
616              dst[1] = ((cp & 0x3F_u) | TAG_CONT) as u8;
617              2
618          } else if cp < MAX_THREE_B {
619              dst[0] = (((cp >> 12u) & 0x0F_u) | TAG_THREE_B) as u8;
620              dst[1] = (((cp >> 6u) & 0x3F_u) | TAG_CONT) as u8;
621              dst[2] = ((cp & 0x3F_u) | TAG_CONT) as u8;
622              3
623          } else {
624              dst[0] = (((cp >> 18u) & 0x07_u) | TAG_FOUR_B) as u8;
625              dst[1] = (((cp >> 12u) & 0x3F_u) | TAG_CONT) as u8;
626              dst[2] = (((cp >> 6u) & 0x3F_u) | TAG_CONT) as u8;
627              dst[3] = ((cp & 0x3F_u) | TAG_CONT) as u8;
628              4
629          }
630      }
631
632      fn encode_utf16(&self, dst: &mut [u16]) -> uint {
633          let ch = *self as uint;
634          if ch <= 0xFFFF_u {
635              // A BMP code point is stored as a single unit; the assertion
636              // double-checks that it is not a surrogate
637              assert!(ch <= 0xD7FF_u || ch >= 0xE000_u);
638              dst[0] = ch as u16;
639              1
640          } else {
641              // Supplementary planes are split into a surrogate pair.
642              assert!(ch >= 0x1_0000_u && ch <= 0x10_FFFF_u);
643              let offset = ch - 0x1_0000_u;
644              dst[0] = 0xD800_u16 | ((offset >> 10) as u16);
645              dst[1] = 0xDC00_u16 | ((offset & 0x3FF_u) as u16);
646              2
647          }
648      }
649  }
650
651  #[cfg(not(test))] // left out of test builds
652  impl Eq for char {
653      #[inline]
654      fn eq(&self, other: &char) -> bool { (*self) == (*other) } // compares the two values directly
655  }
656
657  #[cfg(not(test))] // left out of test builds
658  impl TotalEq for char {} // empty impl: no methods are provided here
659
660  #[cfg(not(test))] // left out of test builds
661  impl Ord for char {
662      #[inline]
663      fn lt(&self, other: &char) -> bool { *self < *other } // compares the two values directly
664  }
665
666  #[cfg(not(test))] // left out of test builds
667  impl TotalOrd for char {
668      fn cmp(&self, other: &char) -> Ordering {
669          (*self as u32).cmp(&(*other as u32)) // order by numeric code point value
670      }
671  }
672
673  #[cfg(not(test))] // left out of test builds
674  impl Default for char {
675      #[inline]
676      fn default() -> char { '\x00' } // the default `char` is U+0000
677  }
678
679  #[cfg(test)]
680  mod test {
681      use super::{escape_unicode, escape_default};
682
683      use realcore::char::Char;
684      use slice::ImmutableVector;
685      use realstd::option::{Some, None};
686      use realstd::strbuf::StrBuf;
687      use realstd::str::StrAllocating;
688
689      #[test]
690      fn test_is_lowercase() {
691          assert!('a'.is_lowercase());
692          assert!('Ã¶'.is_lowercase());
693          assert!('Ã'.is_lowercase());
694          assert!(!'Ã'.is_lowercase());
695          assert!(!'P'.is_lowercase());
696      }
697
698      #[test]
699      fn test_is_uppercase() {
700          assert!(!'h'.is_uppercase());
701          assert!(!'Ã¤'.is_uppercase());
702          assert!(!'Ã'.is_uppercase());
703          assert!('Ã'.is_uppercase());
704          assert!('T'.is_uppercase());
705      }
706
707      #[test]
708      fn test_is_whitespace() {
709          assert!(' '.is_whitespace());
710          assert!('\u2007'.is_whitespace());  // non-ASCII: answered by property::White_Space
711          assert!('\t'.is_whitespace());      // inside the '\x09'..'\x0d' fast path
712          assert!('\n'.is_whitespace());      // inside the '\x09'..'\x0d' fast path
713          assert!(!'a'.is_whitespace());
714          assert!(!'_'.is_whitespace());
715          assert!(!'\u0000'.is_whitespace()); // below the '\x09'..'\x0d' range
716      }
717
718      #[test]
719      fn test_to_digit() {
720          assert_eq!('0'.to_digit(10u), Some(0u));
721          assert_eq!('1'.to_digit(2u), Some(1u));
722          assert_eq!('2'.to_digit(3u), Some(2u));
723          assert_eq!('9'.to_digit(10u), Some(9u));
724          assert_eq!('a'.to_digit(16u), Some(10u));  // letters count from 10 ...
725          assert_eq!('A'.to_digit(16u), Some(10u));  // ... in either case
726          assert_eq!('b'.to_digit(16u), Some(11u));
727          assert_eq!('B'.to_digit(16u), Some(11u));
728          assert_eq!('z'.to_digit(36u), Some(35u));  // largest digit of the largest radix
729          assert_eq!('Z'.to_digit(36u), Some(35u));
730          assert_eq!(' '.to_digit(10u), None);       // not a digit character at all
731          assert_eq!('$'.to_digit(36u), None);
732      }
733
734      #[test]
735      fn test_to_lowercase() {
736          assert_eq!('A'.to_lowercase(), 'a');
737          assert_eq!('Ã'.to_lowercase(), 'Ã¶');
738          assert_eq!('Ã'.to_lowercase(), 'Ã');
739          assert_eq!('Ã'.to_lowercase(), 'Ã¼');
740          assert_eq!('ð©'.to_lowercase(), 'ð©');
741          assert_eq!('Î£'.to_lowercase(), 'Ï');
742          assert_eq!('Î¤'.to_lowercase(), 'Ï');
743          assert_eq!('Î'.to_lowercase(), 'Î¹');
744          assert_eq!('Î'.to_lowercase(), 'Î³');
745          assert_eq!('Î'.to_lowercase(), 'Î¼');
746          assert_eq!('Î'.to_lowercase(), 'Î±');
747          assert_eq!('Î£'.to_lowercase(), 'Ï');
748      }
749
750      #[test]
751      fn test_to_uppercase() {
752          assert_eq!('a'.to_uppercase(), 'A');
753          assert_eq!('Ã¶'.to_uppercase(), 'Ã');
754          assert_eq!('Ã'.to_uppercase(), 'Ã'); // not áº: Latin capital letter sharp s
755          assert_eq!('Ã¼'.to_uppercase(), 'Ã');
756          assert_eq!('ð©'.to_uppercase(), 'ð©');
757
758          assert_eq!('Ï'.to_uppercase(), 'Î£');
759          assert_eq!('Ï'.to_uppercase(), 'Î¤');
760          assert_eq!('Î¹'.to_uppercase(), 'Î');
761          assert_eq!('Î³'.to_uppercase(), 'Î');
762          assert_eq!('Î¼'.to_uppercase(), 'Î');
763          assert_eq!('Î±'.to_uppercase(), 'Î');
764          assert_eq!('Ï'.to_uppercase(), 'Î£');
765      }
766
767      #[test]
768      fn test_is_control() {
769          assert!('\u0000'.is_control());
770          assert!('\u0003'.is_control());
771          assert!('\u0006'.is_control());
772          assert!('\u0009'.is_control());   // tab
773          assert!('\u007f'.is_control());   // DEL
774          assert!('\u0092'.is_control());   // a C1 control code
775          assert!(!'\u0020'.is_control());  // space
776          assert!(!'\u0055'.is_control());  // 'U'
777          assert!(!'\u0068'.is_control());  // 'h'
778      }
779
780      #[test]
781      fn test_is_digit() {
782          assert!('2'.is_digit());
783          assert!('7'.is_digit());
784          assert!(!'c'.is_digit());
785          assert!(!'i'.is_digit());
786          assert!(!'z'.is_digit());
787          assert!(!'Q'.is_digit());
788      }
789
790      #[test]
791      fn test_escape_default() {
792          fn escaped(input: char) -> ~str {
793              let mut buf = StrBuf::new();
794              escape_default(input, |ch| buf.push_char(ch));
795              buf.into_owned()
796          }
797          assert_eq!(escaped('\n'), "\\n".to_owned());
798          assert_eq!(escaped('\r'), "\\r".to_owned());
799          assert_eq!(escaped('\''), "\\'".to_owned());
800          assert_eq!(escaped('"'), "\\\"".to_owned());
801          assert_eq!(escaped(' '), " ".to_owned());
802          assert_eq!(escaped('a'), "a".to_owned());
803          assert_eq!(escaped('~'), "~".to_owned());
804          assert_eq!(escaped('\x00'), "\\x00".to_owned());
805          assert_eq!(escaped('\x1f'), "\\x1f".to_owned());
806          assert_eq!(escaped('\x7f'), "\\x7f".to_owned());
807          assert_eq!(escaped('\xff'), "\\xff".to_owned());
808          assert_eq!(escaped('\u011b'), "\\u011b".to_owned());
809          assert_eq!(escaped('\U0001d4b6'), "\\U0001d4b6".to_owned());
810      }
811
812      #[test]
813      fn test_escape_unicode() {
814          fn escaped(input: char) -> ~str {
815              let mut buf = StrBuf::new();
816              escape_unicode(input, |ch| buf.push_char(ch));
817              buf.into_owned()
818          }
819          assert_eq!(escaped('\x00'), "\\x00".to_owned());
820          assert_eq!(escaped('\n'), "\\x0a".to_owned());
821          assert_eq!(escaped(' '), "\\x20".to_owned());
822          assert_eq!(escaped('a'), "\\x61".to_owned());
823          assert_eq!(escaped('\u011b'), "\\u011b".to_owned());
824          assert_eq!(escaped('\U0001d4b6'), "\\U0001d4b6".to_owned());
825      }
826
827      #[test]
828      fn test_to_str() {
829          use realstd::to_str::ToStr;  // brings `to_str` into scope for this test only
830          let s = 't'.to_str();
831          assert_eq!(s, "t".to_owned());  // a `char` stringifies to itself
832      }
833
834      #[test]
835      fn test_encode_utf8() {
836          fn check(input: char, expect: &[u8]) {
837              let mut buf = [0u8, ..4];  // 4 bytes: the longest encoding `encode_utf8` writes
838              let n = input.encode_utf8(buf /* as mut slice! */);
839              assert_eq!(buf.slice_to(n), expect);  // compare only the bytes reported as written
840          }
841
842          check('x', [0x78]);                            // one-byte form
843          check('\u00e9', [0xc3, 0xa9]);                 // two-byte form
844          check('\ua66e', [0xea, 0x99, 0xae]);           // three-byte form
845          check('\U0001f4a9', [0xf0, 0x9f, 0x92, 0xa9]); // four-byte form
846      }
847
848      #[test]
849      fn test_encode_utf16() {
850          fn check(input: char, expect: &[u16]) {
851              let mut buf = [0u16, ..2];  // 2 units: the most `encode_utf16` writes
852              let n = input.encode_utf16(buf /* as mut slice! */);
853              assert_eq!(buf.slice_to(n), expect);  // compare only the units reported as written
854          }
855
856          check('x', [0x0078]);                  // BMP: a single unit
857          check('\u00e9', [0x00e9]);             // BMP: a single unit
858          check('\ua66e', [0xa66e]);             // BMP: a single unit
859          check('\U0001f4a9', [0xd83d, 0xdca9]); // above the BMP: a surrogate pair
860      }
861  }

libcore/char.rs:210:10-210:10 -fn- definition:
pub fn to_digit(c: char, radix: uint) -> Option<uint> {
    if radix > 36 {
        fail!("to_digit: radix is too high (maximum 36)");
references:- 2
595:     fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }

libcore/char.rs:298:43-298:43 -fn- definition:
// Decompose a precomposed Hangul syllable
fn decompose_hangul(s: char, f: |char|) {
    let si = s as uint - S_BASE;
references:- 2
320:     } else {
321:         decompose_hangul(c, f);
322:     }
--
329:     } else {
330:         decompose_hangul(c, f);
331:     }

libcore/char.rs:144:10-144:10 -fn- definition:
pub fn is_alphanumeric(c: char) -> bool {
    derived_property::Alphabetic(c)
        || general_category::Nd(c)
references:- 2
587:     fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
libcore/str.rs:
1619:     #[inline]
1620:     fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }

libcore/char.rs:130:10-130:10 -fn- definition:
pub fn is_whitespace(c: char) -> bool {
    // As an optimization ASCII whitespace characters are checked separately
    c == ' '
references:- 5
585:     fn is_whitespace(&self) -> bool { is_whitespace(*self) }
libcore/str.rs:
1688:     fn trim_right(&self) -> &'a str {
1689:         self.trim_right_chars(char::is_whitespace)
1690:     }