(index<- )        ./libstd/char.rs

    git branch:    * master           c7553ea auto merge of #13609 : richo/rust/str-type-vim, r=alexcrichton
    modified:    Sat Apr 19 11:22:39 2014

   1  // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
   2  // file at the top-level directory of this distribution and at
   3  // http://rust-lang.org/COPYRIGHT.
   4  //
   5  // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6  // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7  // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8  // option. This file may not be copied, modified, or distributed
   9  // except according to those terms.
  10
  11  //! Character manipulation (`char` type, Unicode Scalar Value)
  12  //!
  13  //! This module provides the `Char` trait, as well as its implementation
  14  //! for the primitive `char` type, in order to allow basic character manipulation.
  15  //!
  16  //! A `char` actually represents a
  17  //! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
  18  //! as it can contain any Unicode code point except high-surrogate and
  19  //! low-surrogate code points.
  20  //!
  21  //! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
  22  //! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
  23  //! however the converse is not always true due to the above range limits
  24  //! and, as such, should be performed via the `from_u32` function.
  25
  26
  27  use cast::transmute;
  28  use option::{None, Option, Some};
  29  use iter::{Iterator, range_step};
  30  use str::StrSlice;
  31  use unicode::{derived_property, property, general_category, decompose, conversions};
  32
  33  #[cfg(test)] use str::Str;
  34  #[cfg(test)] use strbuf::StrBuf;
  35  #[cfg(test)] use slice::ImmutableVector;
  36
  37  #[cfg(not(test))] use cmp::{Eq, Ord};
  38  #[cfg(not(test))] use default::Default;
  39
  40  // Tag bits and exclusive code point limits used when encoding a `char` as UTF-8
  41  static TAG_CONT: uint = 0x80_u;        // 0b1000_0000, OR-ed into every continuation byte
  42  static MAX_ONE_B: uint = 0x80_u;       // code points below this take one byte
  43  static TAG_TWO_B: uint = 0xC0_u;       // 0b1100_0000, lead byte of a two-byte sequence
  44  static MAX_TWO_B: uint = 0x800_u;      // code points below this take at most two bytes
  45  static TAG_THREE_B: uint = 0xE0_u;     // 0b1110_0000, lead byte of a three-byte sequence
  46  static MAX_THREE_B: uint = 0x1_0000_u; // code points below this take at most three bytes
  47  static TAG_FOUR_B: uint = 0xF0_u;      // 0b1111_0000, lead byte of a four-byte sequence
  48
  49  /*
  50      Lu  Uppercase_Letter        an uppercase letter
  51      Ll  Lowercase_Letter        a lowercase letter
  52      Lt  Titlecase_Letter        a digraphic character, with first part uppercase
  53      Lm  Modifier_Letter         a modifier letter
  54      Lo  Other_Letter            other letters, including syllables and ideographs
  55      Mn  Nonspacing_Mark         a nonspacing combining mark (zero advance width)
  56      Mc  Spacing_Mark            a spacing combining mark (positive advance width)
  57      Me  Enclosing_Mark          an enclosing combining mark
  58      Nd  Decimal_Number          a decimal digit
  59      Nl  Letter_Number           a letterlike numeric character
  60      No  Other_Number            a numeric character of other type
  61      Pc  Connector_Punctuation   a connecting punctuation mark, like a tie
  62      Pd  Dash_Punctuation        a dash or hyphen punctuation mark
  63      Ps  Open_Punctuation        an opening punctuation mark (of a pair)
  64      Pe  Close_Punctuation       a closing punctuation mark (of a pair)
  65      Pi  Initial_Punctuation     an initial quotation mark
  66      Pf  Final_Punctuation       a final quotation mark
  67      Po  Other_Punctuation       a punctuation mark of other type
  68      Sm  Math_Symbol             a symbol of primarily mathematical use
  69      Sc  Currency_Symbol         a currency sign
  70      Sk  Modifier_Symbol         a non-letterlike modifier symbol
  71      So  Other_Symbol            a symbol of other type
  72      Zs  Space_Separator         a space character (of various non-zero widths)
  73      Zl  Line_Separator          U+2028 LINE SEPARATOR only
  74      Zp  Paragraph_Separator     U+2029 PARAGRAPH SEPARATOR only
  75      Cc  Control                 a C0 or C1 control code
  76      Cf  Format                  a format control character
  77      Cs  Surrogate               a surrogate code point
  78      Co  Private_Use             a private-use character
  79      Cn  Unassigned              a reserved unassigned code point or a noncharacter
  80  */
  81
  82  /// The highest valid code point, U+10FFFF; `from_u32` rejects anything above it
  83  pub static MAX: char = '\U0010ffff';
  84
  85  /// Converts a `u32` to a `char`; surrogates and values above `MAX` give `None`
  86  #[inline]
  87  pub fn from_u32(i: u32) -> Option<char> {
  88      // surrogate code points are the only hole below `MAX`
  89      let is_surrogate = 0xD800 <= i && i <= 0xDFFF;
  90      if is_surrogate || i > MAX as u32 {
  91          return None;
  92      }
  93      Some(unsafe { transmute(i) })
  94  }
  95
  96  /// Returns whether the specified `char` has the Unicode derived property
  97  /// 'Alphabetic' (forwards to `derived_property::Alphabetic`)
  98  pub fn is_alphabetic(c: char) -> bool   { derived_property::Alphabetic(c) }
  99
 100  /// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
 101  ///
 102  /// 'XID_Start' is a Unicode Derived Property specified in
 103  /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
 104  /// mostly similar to 'ID_Start' but modified for closure under NFKx.
 105  pub fn is_XID_start(c: char) -> bool    { derived_property::XID_Start(c) }
106
 107  /// Returns `true` if the specified `char` has the 'XID_Continue' Unicode property
 108  ///
 109  /// 'XID_Continue' is a Unicode Derived Property specified in
 110  /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
 111  /// mostly similar to 'ID_Continue' but modified for closure under NFKx.
 112  pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
113
 114  /// Indicates whether a `char` is in lower case
 115  ///
 116  /// This is defined according to the terms of the Unicode Derived Core
 117  /// Property 'Lowercase'; the check itself is forwarded to
 118  /// `derived_property::Lowercase`.
 119  #[inline]
 120  pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
121
 122  /// Indicates whether a `char` is in upper case
 123  ///
 124  /// This is defined according to the terms of the Unicode Derived Core
 125  /// Property 'Uppercase'; the check itself is forwarded to
 126  /// `derived_property::Uppercase`.
 127  #[inline]
 128  pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
129
130  ///
131  /// Indicates whether a `char` is whitespace
132  ///
133  /// Whitespace is defined in terms of the Unicode Property 'White_Space'.
134  ///
135  #[inline]
136  pub fn is_whitespace(c: char) -> bool {
137      // As an optimization ASCII whitespace characters are checked separately
138      c == ' '
139          || ('\x09' <= c && c <= '\x0d')
140          || property::White_Space(c)
141  }
142
143  ///
144  /// Indicates whether a `char` is alphanumeric
145  ///
146  /// Alphanumericness is defined in terms of the Unicode General Categories
147  /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
148  ///
149  #[inline]
150  pub fn is_alphanumeric(c: char) -> bool {
151      derived_property::Alphabetic(c)
152          || general_category::Nd(c)
153          || general_category::Nl(c)
154          || general_category::No(c)
155  }
156
 157  ///
 158  /// Indicates whether a `char` is a control code point
 159  ///
 160  /// Control code points are defined in terms of the Unicode General Category
 161  /// 'Cc'; the check itself is forwarded to `general_category::Cc`.
 162  ///
 163  #[inline]
 164  pub fn is_control(c: char) -> bool { general_category::Cc(c) }
165
166  /// Indicates whether the `char` is numeric (Nd, Nl, or No)
167  #[inline]
168  pub fn is_digit(c: char) -> bool {
169      general_category::Nd(c)
170          || general_category::Nl(c)
171          || general_category::No(c)
172  }
173
174  ///
175  /// Checks if a `char` parses as a numeric digit in the given radix
176  ///
177  /// Compared to `is_digit()`, this function only recognizes the
178  /// characters `0-9`, `a-z` and `A-Z`.
179  ///
180  /// # Return value
181  ///
182  /// Returns `true` if `c` is a valid digit under `radix`, and `false`
183  /// otherwise.
184  ///
185  /// # Failure
186  ///
187  /// Fails if given a `radix` > 36.
188  ///
189  /// # Note
190  ///
191  /// This just wraps `to_digit()`.
192  ///
193  #[inline]
194  pub fn is_digit_radix(c: char, radix: uint) -> bool {
195      match to_digit(c, radix) {
196          Some(_) => true,
197          None    => false,
198      }
199  }
200
201  ///
202  /// Converts a `char` to the corresponding digit
203  ///
204  /// # Return value
205  ///
206  /// If `c` is between '0' and '9', the corresponding value
207  /// between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is
208  /// 'b' or 'B', 11, etc. Returns none if the `char` does not
209  /// refer to a digit in the given radix.
210  ///
211  /// # Failure
212  ///
213  /// Fails if given a `radix` outside the range `[0..36]`.
214  ///
215  #[inline]
216  pub fn to_digit(c: char, radix: uint) -> Option<uint> {
217      if radix > 36 {
218          fail!("to_digit: radix {} is too high (maximum 36)", radix);
219      }
220      let val = match c {
221        '0' .. '9' => c as uint - ('0' as uint),
222        'a' .. 'z' => c as uint + 10u - ('a' as uint),
223        'A' .. 'Z' => c as uint + 10u - ('A' as uint),
224        _ => return None,
225      };
226      if val < radix { Some(val) }
227      else { None }
228  }
229
 230  /// Convert a char to its uppercase equivalent
 231  ///
 232  /// The case-folding performed is the common or simple mapping:
 233  /// it maps one Unicode codepoint (one char in Rust) to its uppercase equivalent according
 234  /// to the Unicode database at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt.
 235  /// The additional SpecialCasing.txt is not considered here, as it expands to multiple
 236  /// codepoints in some cases.
 237  ///
 238  /// A full reference can be found here:
 239  /// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
 240  ///
 241  /// # Return value
 242  ///
 243  /// Returns the uppercase equivalent of the char, or the char itself if no conversion was made
 244  #[inline]
 245  pub fn to_uppercase(c: char) -> char {
 246      conversions::to_upper(c)
 247  }
248
 249  /// Convert a char to its lowercase equivalent
 250  ///
 251  /// The case-folding performed is the common or simple mapping;
 252  /// see `to_uppercase` for references and more information.
 253  ///
 254  /// # Return value
 255  ///
 256  /// Returns the lowercase equivalent of the char, or the char itself if no conversion is possible
 257  #[inline]
 258  pub fn to_lowercase(c: char) -> char {
 259      conversions::to_lower(c)
 260  }
261
262  ///
263  /// Converts a number to the character representing it
264  ///
265  /// # Return value
266  ///
267  /// Returns `Some(char)` if `num` represents one digit under `radix`,
268  /// using one character of `0-9` or `a-z`, or `None` if it doesn't.
269  ///
270  /// # Failure
271  ///
272  /// Fails if given an `radix` > 36.
273  ///
274  #[inline]
275  pub fn from_digit(num: uint, radix: uint) -> Option<char> {
276      if radix > 36 {
277          fail!("from_digit: radix {} is to high (maximum 36)", num);
278      }
279      if num < radix {
280          unsafe {
281              if num < 10 {
282                  Some(transmute(('0' as uint + num) as u32))
283              } else {
284                  Some(transmute(('a' as uint + num - 10u) as u32))
285              }
286          }
287      } else {
288          None
289      }
290  }
291
292  // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
293  static S_BASE: uint = 0xAC00;
294  static L_BASE: uint = 0x1100;
295  static V_BASE: uint = 0x1161;
296  static T_BASE: uint = 0x11A7;
297  static L_COUNT: uint = 19;
298  static V_COUNT: uint = 21;
299  static T_COUNT: uint = 28;
300  static N_COUNT: uint = (V_COUNT * T_COUNT);
301  static S_COUNT: uint = (L_COUNT * N_COUNT);
302
 303  // Decompose a precomposed Hangul syllable, handing each component to `f` in order
 304  fn decompose_hangul(s: char, f: |char|) {
 305      let si = s as uint - S_BASE; // offset of `s` from S_BASE; both callers check s >= S_BASE first
 306
 307      let li = si / N_COUNT; // index of the first component
 308      unsafe {
 309          f(transmute((L_BASE + li) as u32));
 310
 311          let vi = (si % N_COUNT) / T_COUNT; // index of the second component
 312          f(transmute((V_BASE + vi) as u32));
 313
 314          let ti = si % T_COUNT; // index of the third component; 0 means there is none
 315          if ti > 0 {
 316              f(transmute((T_BASE + ti) as u32));
 317          }
 318      }
 319  }
320
321  /// Returns the canonical decomposition of a character
322  pub fn decompose_canonical(c: char, f: |char|) {
323      if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
324          decompose::canonical(c, f);
325      } else {
326          decompose_hangul(c, f);
327      }
328  }
329
330  /// Returns the compatibility decomposition of a character
331  pub fn decompose_compatible(c: char, f: |char|) {
332      if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
333          decompose::compatibility(c, f);
334      } else {
335          decompose_hangul(c, f);
336      }
337  }
338
 339  ///
 340  /// Emits the hexadecimal Unicode escape of a `char`, one character at a time, through `f`
 341  ///
 342  /// The rules are as follows:
 343  ///
 344  /// - chars in [0,0xff] get 2-digit escapes: `\\xNN`
 345  /// - chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
 346  /// - chars in [0x10000,0x10ffff] get 8-digit escapes: `\\UNNNNNNNN`
 347  ///
 348  pub fn escape_unicode(c: char, f: |char|) {
 349      // avoid calling str::to_str_radix because we don't really need to allocate
 350      // here.
 351      f('\\');
 352      let pad = match () { // emits the escape letter and yields the number of hex digits
 353          _ if c <= '\xff'    => { f('x'); 2 }
 354          _ if c <= '\uffff'  => { f('u'); 4 }
 355          _                   => { f('U'); 8 }
 356      };
 357      for offset in range_step::<i32>(4 * (pad - 1), -1, -4) { // bit offset of each digit, most significant first
 358          unsafe {
 359              match ((c as i32) >> offset) & 0xf {
 360                  i @ 0 .. 9 => { f(transmute('0' as i32 + i)); }
 361                  i => { f(transmute('a' as i32 + (i - 10))); } // 10..15 become 'a'..'f'
 362              }
 363          }
 364      }
 365  }
366
367  ///
368  /// Returns a 'default' ASCII and C++11-like literal escape of a `char`
369  ///
370  /// The default is chosen with a bias toward producing literals that are
371  /// legal in a variety of languages, including C++11 and similar C-family
372  /// languages. The exact rules are:
373  ///
374  /// - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
375  /// - Single-quote, double-quote and backslash chars are backslash-escaped.
376  /// - Any other chars in the range [0x20,0x7e] are not escaped.
377  /// - Any other chars are given hex unicode escapes; see `escape_unicode`.
378  ///
379  pub fn escape_default(c: char, f: |char|) {
380      match c {
381          '\t' => { f('\\'); f('t'); }
382          '\r' => { f('\\'); f('r'); }
383          '\n' => { f('\\'); f('n'); }
384          '\\' => { f('\\'); f('\\'); }
385          '\'' => { f('\\'); f('\''); }
386          '"'  => { f('\\'); f('"'); }
387          '\x20' .. '\x7e' => { f(c); }
388          _ => c.escape_unicode(f),
389      }
390  }
391
392  /// Returns the amount of bytes this `char` would need if encoded in UTF-8
393  pub fn len_utf8_bytes(c: char) -> uint {
394      static MAX_ONE_B:   uint = 128u;
395      static MAX_TWO_B:   uint = 2048u;
396      static MAX_THREE_B: uint = 65536u;
397      static MAX_FOUR_B:  uint = 2097152u;
398
399      let code = c as uint;
400      match () {
401          _ if code < MAX_ONE_B   => 1u,
402          _ if code < MAX_TWO_B   => 2u,
403          _ if code < MAX_THREE_B => 3u,
404          _ if code < MAX_FOUR_B  => 4u,
405          _                       => fail!("invalid character!"),
406      }
407  }
408
 409  /// Useful functions for Unicode characters.
 410  pub trait Char {
 411      /// Returns whether the specified character is considered a Unicode
 412      /// alphabetic code point.
 413      fn is_alphabetic(&self) -> bool;
 414
 415      /// Returns whether the specified character satisfies the 'XID_Start'
 416      /// Unicode property.
 417      ///
 418      /// 'XID_Start' is a Unicode Derived Property specified in
 419      /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
 420      /// mostly similar to 'ID_Start' but modified for closure under NFKx.
 421      fn is_XID_start(&self) -> bool;
 422
 423      /// Returns whether the specified character satisfies the 'XID_Continue'
 424      /// Unicode property.
 425      ///
 426      /// 'XID_Continue' is a Unicode Derived Property specified in
 427      /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
 428      /// mostly similar to 'ID_Continue' but modified for closure under NFKx.
 429      fn is_XID_continue(&self) -> bool;
 430
 431
 432      /// Indicates whether a character is in lowercase.
 433      ///
 434      /// This is defined according to the terms of the Unicode Derived Core
 435      /// Property `Lowercase`.
 436      fn is_lowercase(&self) -> bool;
 437
 438      /// Indicates whether a character is in uppercase.
 439      ///
 440      /// This is defined according to the terms of the Unicode Derived Core
 441      /// Property `Uppercase`.
 442      fn is_uppercase(&self) -> bool;
 443
 444      /// Indicates whether a character is whitespace.
 445      ///
 446      /// Whitespace is defined in terms of the Unicode Property `White_Space`.
 447      fn is_whitespace(&self) -> bool;
 448
 449      /// Indicates whether a character is alphanumeric.
 450      ///
 451      /// Alphanumericness is defined in terms of the Unicode General Categories
 452      /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
 453      fn is_alphanumeric(&self) -> bool;
 454
 455      /// Indicates whether a character is a control code point.
 456      ///
 457      /// Control code points are defined in terms of the Unicode General
 458      /// Category `Cc`.
 459      fn is_control(&self) -> bool;
 460
 461      /// Indicates whether the character is numeric (Nd, Nl, or No).
 462      fn is_digit(&self) -> bool;
 463
 464      /// Checks if a `char` parses as a numeric digit in the given radix.
 465      ///
 466      /// Compared to `is_digit()`, this function only recognizes the characters
 467      /// `0-9`, `a-z` and `A-Z`.
 468      ///
 469      /// # Return value
 470      ///
 471      /// Returns `true` if the character is a valid digit under `radix`, and
 472      /// `false` otherwise.
 473      ///
 474      /// # Failure
 475      ///
 476      /// Fails if given a radix > 36.
 477      fn is_digit_radix(&self, radix: uint) -> bool;
 478
 479      /// Converts a character to the corresponding digit.
 480      ///
 481      /// # Return value
 482      ///
 483      /// If the character is between '0' and '9', the corresponding value
 484      /// between 0 and 9. If it is 'a' or 'A', 10. If it is 'b' or 'B', 11, etc.
 485      /// Returns none if the character does not refer to a digit in the given radix.
 486      ///
 487      /// # Failure
 488      ///
 489      /// Fails if given a radix > 36.
 490      fn to_digit(&self, radix: uint) -> Option<uint>;
 491
 492      /// Converts a character to its lowercase equivalent.
 493      ///
 494      /// The case-folding performed is the common or simple mapping. See
 495      /// `to_uppercase()` for references and more information.
 496      ///
 497      /// # Return value
 498      ///
 499      /// Returns the lowercase equivalent of the character, or the character
 500      /// itself if no conversion is possible.
 501      fn to_lowercase(&self) -> char;
 502
 503      /// Converts a character to its uppercase equivalent.
 504      ///
 505      /// The case-folding performed is the common or simple mapping: it maps
 506      /// one Unicode codepoint (one character in Rust) to its uppercase
 507      /// equivalent according to the Unicode database [1]. The additional
 508      /// `SpecialCasing.txt` is not considered here, as it expands to multiple
 509      /// codepoints in some cases.
 510      ///
 511      /// A full reference can be found here [2].
 512      ///
 513      /// # Return value
 514      ///
 515      /// Returns the uppercase equivalent of the character, or the character
 516      /// itself if no conversion was made.
 517      ///
 518      /// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
 519      ///
 520      /// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
 521      fn to_uppercase(&self) -> char;
 522
 523      /// Converts a number to the character representing it.
 524      ///
 525      /// # Return value
 526      ///
 527      /// Returns `Some(char)` if `num` represents one digit under `radix`,
 528      /// using one character of `0-9` or `a-z`, or `None` if it doesn't.
 529      ///
 530      /// # Failure
 531      ///
 532      /// Fails if given a radix > 36.
 533      fn from_digit(num: uint, radix: uint) -> Option<char>;
 534
 535      /// Passes the hexadecimal Unicode escape of a character to `f`.
 536      ///
 537      /// The rules are as follows:
 538      ///
 539      /// * Characters in [0,0xff] get 2-digit escapes: `\\xNN`.
 540      /// * Characters in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`.
 541      /// * Characters in [0x10000,0x10ffff] get 8-digit escapes: `\\UNNNNNNNN`.
 542      fn escape_unicode(&self, f: |char|);
 543
 544      /// Passes a 'default' ASCII and C++11-like literal escape of a
 545      /// character to `f`.
 546      ///
 547      /// The default is chosen with a bias toward producing literals that are
 548      /// legal in a variety of languages, including C++11 and similar C-family
 549      /// languages. The exact rules are:
 550      ///
 551      /// * Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
 552      /// * Single-quote, double-quote and backslash chars are backslash-
 553      ///   escaped.
 554      /// * Any other chars in the range [0x20,0x7e] are not escaped.
 555      /// * Any other chars are given hex unicode escapes; see `escape_unicode`.
 556      fn escape_default(&self, f: |char|);
 557
 558      /// Returns the number of bytes this character would need if encoded in
 559      /// UTF-8.
 560      fn len_utf8_bytes(&self) -> uint;
 561
 562      /// Encodes this character as UTF-8 into the provided byte buffer.
 563      ///
 564      /// The buffer must be at least 4 bytes long or a runtime failure may
 565      /// occur.
 566      ///
 567      /// This will then return the number of bytes written to the slice.
 568      fn encode_utf8(&self, dst: &mut [u8]) -> uint;
 569
 570      /// Encodes this character as UTF-16 into the provided `u16` buffer.
 571      ///
 572      /// The buffer must be at least 2 elements long or a runtime failure may
 573      /// occur.
 574      ///
 575      /// This will then return the number of `u16`s written to the slice.
 576      fn encode_utf16(&self, dst: &mut [u16]) -> uint;
 577  }
578
 579  impl Char for char { // methods up to `len_utf8_bytes` forward to the free function of the same name
 580      fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
 581
 582      fn is_XID_start(&self) -> bool { is_XID_start(*self) }
 583
 584      fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
 585
 586      fn is_lowercase(&self) -> bool { is_lowercase(*self) }
 587
 588      fn is_uppercase(&self) -> bool { is_uppercase(*self) }
 589
 590      fn is_whitespace(&self) -> bool { is_whitespace(*self) }
 591
 592      fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
 593
 594      fn is_control(&self) -> bool { is_control(*self) }
 595
 596      fn is_digit(&self) -> bool { is_digit(*self) }
 597
 598      fn is_digit_radix(&self, radix: uint) -> bool { is_digit_radix(*self, radix) }
 599
 600      fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }
 601
 602      fn to_lowercase(&self) -> char { to_lowercase(*self) }
 603
 604      fn to_uppercase(&self) -> char { to_uppercase(*self) }
 605
 606      fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }
 607
 608      fn escape_unicode(&self, f: |char|) { escape_unicode(*self, f) }
 609
 610      fn escape_default(&self, f: |char|) { escape_default(*self, f) }
 611
 612      fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
 613
 614      fn encode_utf8(&self, dst: &mut [u8]) -> uint { // writes 1 to 4 bytes at the start of `dst`, returns how many
 615          let code = *self as uint;
 616          if code < MAX_ONE_B { // one byte: the code point itself
 617              dst[0] = code as u8;
 618              return 1;
 619          } else if code < MAX_TWO_B { // two bytes: 5 + 6 payload bits
 620              dst[0] = (code >> 6u & 31u | TAG_TWO_B) as u8;
 621              dst[1] = (code & 63u | TAG_CONT) as u8;
 622              return 2;
 623          } else if code < MAX_THREE_B { // three bytes: 4 + 6 + 6 payload bits
 624              dst[0] = (code >> 12u & 15u | TAG_THREE_B) as u8;
 625              dst[1] = (code >> 6u & 63u | TAG_CONT) as u8;
 626              dst[2] = (code & 63u | TAG_CONT) as u8;
 627              return 3;
 628          } else { // four bytes: 3 + 6 + 6 + 6 payload bits
 629              dst[0] = (code >> 18u & 7u | TAG_FOUR_B) as u8;
 630              dst[1] = (code >> 12u & 63u | TAG_CONT) as u8;
 631              dst[2] = (code >> 6u & 63u | TAG_CONT) as u8;
 632              dst[3] = (code & 63u | TAG_CONT) as u8;
 633              return 4;
 634          }
 635      }
 636
 637      fn encode_utf16(&self, dst: &mut [u16]) -> uint { // writes 1 or 2 units at the start of `dst`, returns how many
 638          let mut ch = *self as uint;
 639          if (ch & 0xFFFF_u) == ch { // true exactly when the value fits in 16 bits
 640              // The BMP falls through (assuming non-surrogate, as it
 641              // should)
 642              assert!(ch <= 0xD7FF_u || ch >= 0xE000_u);
 643              dst[0] = ch as u16;
 644              1
 645          } else {
 646              // Supplementary planes break into surrogates.
 647              assert!(ch >= 0x1_0000_u && ch <= 0x10_FFFF_u);
 648              ch -= 0x1_0000_u; // what remains fits in 20 bits
 649              dst[0] = 0xD800_u16 | ((ch >> 10) as u16); // first unit: upper 10 bits
 650              dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16); // second unit: lower 10 bits
 651              2
 652          }
 653      }
 654  }
655
 656  #[cfg(not(test))]
 657  impl Eq for char { // only compiled when not building the test configuration
 658      #[inline]
 659      fn eq(&self, other: &char) -> bool { (*self) == (*other) } // compares the two dereferenced values
 660  }
661
 662  #[cfg(not(test))]
 663  impl Ord for char { // only compiled when not building the test configuration
 664      #[inline]
 665      fn lt(&self, other: &char) -> bool { *self < *other } // orders by the two dereferenced values
 666  }
667
 668  #[cfg(not(test))]
 669  impl Default for char { // only compiled when not building the test configuration
 670      #[inline]
 671      fn default() -> char { '\x00' } // the default `char` is U+0000
 672  }
673
674  #[test]
675  fn test_is_lowercase() {
676      assert!('a'.is_lowercase());
677      assert!('Ã¶'.is_lowercase());
678      assert!('Ã'.is_lowercase());
679      assert!(!'Ã'.is_lowercase());
680      assert!(!'P'.is_lowercase());
681  }
682
683  #[test]
684  fn test_is_uppercase() {
685      assert!(!'h'.is_uppercase());
686      assert!(!'Ã¤'.is_uppercase());
687      assert!(!'Ã'.is_uppercase());
688      assert!('Ã'.is_uppercase());
689      assert!('T'.is_uppercase());
690  }
691
 692  #[test]
 693  fn test_is_whitespace() {
 694      assert!(' '.is_whitespace());
 695      assert!('\u2007'.is_whitespace()); // U+2007 FIGURE SPACE, a non-ASCII whitespace
 696      assert!('\t'.is_whitespace());     // '\x09', start of the ASCII range checked up front
 697      assert!('\n'.is_whitespace());
 698      assert!(!'a'.is_whitespace());
 699      assert!(!'_'.is_whitespace());
 700      assert!(!'\u0000'.is_whitespace()); // NUL is a control code, not whitespace
 701  }
702
 703  #[test]
 704  fn test_to_digit() {
 705      assert_eq!('0'.to_digit(10u), Some(0u));
 706      assert_eq!('1'.to_digit(2u), Some(1u));   // largest digit valid in radix 2
 707      assert_eq!('2'.to_digit(3u), Some(2u));   // largest digit valid in radix 3
 708      assert_eq!('9'.to_digit(10u), Some(9u));
 709      assert_eq!('a'.to_digit(16u), Some(10u)); // letters are accepted in both cases
 710      assert_eq!('A'.to_digit(16u), Some(10u));
 711      assert_eq!('b'.to_digit(16u), Some(11u));
 712      assert_eq!('B'.to_digit(16u), Some(11u));
 713      assert_eq!('z'.to_digit(36u), Some(35u)); // largest digit of the largest accepted radix
 714      assert_eq!('Z'.to_digit(36u), Some(35u));
 715      assert_eq!(' '.to_digit(10u), None);      // not a digit character at all
 716      assert_eq!('$'.to_digit(36u), None);
 717  }
718
719  #[test]
720  fn test_to_lowercase() {
721      assert_eq!('A'.to_lowercase(), 'a');
722      assert_eq!('Ã'.to_lowercase(), 'Ã¶');
723      assert_eq!('Ã'.to_lowercase(), 'Ã');
724      assert_eq!('Ã'.to_lowercase(), 'Ã¼');
725      assert_eq!('ð©'.to_lowercase(), 'ð©');
726      assert_eq!('Î£'.to_lowercase(), 'Ï');
727      assert_eq!('Î¤'.to_lowercase(), 'Ï');
728      assert_eq!('Î'.to_lowercase(), 'Î¹');
729      assert_eq!('Î'.to_lowercase(), 'Î³');
730      assert_eq!('Î'.to_lowercase(), 'Î¼');
731      assert_eq!('Î'.to_lowercase(), 'Î±');
732      assert_eq!('Î£'.to_lowercase(), 'Ï');
733  }
734
735  #[test]
736  fn test_to_uppercase() {
737      assert_eq!('a'.to_uppercase(), 'A');
738      assert_eq!('Ã¶'.to_uppercase(), 'Ã');
739      assert_eq!('Ã'.to_uppercase(), 'Ã'); // not áº: Latin capital letter sharp s
740      assert_eq!('Ã¼'.to_uppercase(), 'Ã');
741      assert_eq!('ð©'.to_uppercase(), 'ð©');
742
743      assert_eq!('Ï'.to_uppercase(), 'Î£');
744      assert_eq!('Ï'.to_uppercase(), 'Î¤');
745      assert_eq!('Î¹'.to_uppercase(), 'Î');
746      assert_eq!('Î³'.to_uppercase(), 'Î');
747      assert_eq!('Î¼'.to_uppercase(), 'Î');
748      assert_eq!('Î±'.to_uppercase(), 'Î');
749      assert_eq!('Ï'.to_uppercase(), 'Î£');
750  }
751
 752  #[test]
 753  fn test_is_control() {
 754      assert!('\u0000'.is_control());
 755      assert!('\u0003'.is_control());
 756      assert!('\u0006'.is_control());
 757      assert!('\u0009'.is_control());  // tab
 758      assert!('\u007f'.is_control());  // DEL
 759      assert!('\u0092'.is_control());  // a control code outside ASCII
 760      assert!(!'\u0020'.is_control()); // space
 761      assert!(!'\u0055'.is_control()); // 'U'
 762      assert!(!'\u0068'.is_control()); // 'h'
 763  }
764
765  #[test]
766  fn test_is_digit() {
767     assert!('2'.is_digit());
768     assert!('7'.is_digit());
769     assert!(!'c'.is_digit());
770     assert!(!'i'.is_digit());
771     assert!(!'z'.is_digit());
772     assert!(!'Q'.is_digit());
773  }
774
 775  #[test]
 776  fn test_escape_default() {
 777      fn string(c: char) -> ~str { // collects everything `escape_default` emits for `c`
 778          let mut result = StrBuf::new();
 779          escape_default(c, |c| { result.push_char(c); });
 780          return result.into_owned();
 781      }
 782      assert_eq!(string('\n'), "\\n".to_owned());   // short escapes
 783      assert_eq!(string('\r'), "\\r".to_owned());
 784      assert_eq!(string('\''), "\\'".to_owned());   // backslash-prefixed quotes
 785      assert_eq!(string('"'), "\\\"".to_owned());
 786      assert_eq!(string(' '), " ".to_owned());      // 0x20, first char passed through unescaped
 787      assert_eq!(string('a'), "a".to_owned());
 788      assert_eq!(string('~'), "~".to_owned());      // 0x7e, last char passed through unescaped
 789      assert_eq!(string('\x00'), "\\x00".to_owned()); // everything else: hex escapes
 790      assert_eq!(string('\x1f'), "\\x1f".to_owned());
 791      assert_eq!(string('\x7f'), "\\x7f".to_owned());
 792      assert_eq!(string('\xff'), "\\xff".to_owned());
 793      assert_eq!(string('\u011b'), "\\u011b".to_owned());
 794      assert_eq!(string('\U0001d4b6'), "\\U0001d4b6".to_owned());
 795  }
796
 797  #[test]
 798  fn test_escape_unicode() {
 799      fn string(c: char) -> ~str { // collects everything `escape_unicode` emits for `c`
 800          let mut result = StrBuf::new();
 801          escape_unicode(c, |c| { result.push_char(c); });
 802          return result.into_owned();
 803      }
 804      assert_eq!(string('\x00'), "\\x00".to_owned()); // up to 0xff: two hex digits
 805      assert_eq!(string('\n'), "\\x0a".to_owned());
 806      assert_eq!(string(' '), "\\x20".to_owned());
 807      assert_eq!(string('a'), "\\x61".to_owned());
 808      assert_eq!(string('\u011b'), "\\u011b".to_owned()); // up to 0xffff: four hex digits
 809      assert_eq!(string('\U0001d4b6'), "\\U0001d4b6".to_owned()); // beyond that: eight hex digits
 810  }
811
 812  #[test]
 813  fn test_to_str() {
 814      use to_str::ToStr;
 815      let s = 't'.to_str(); // a `char` is expected to stringify to a one-character string
 816      assert_eq!(s, "t".to_owned());
 817  }
818
 819  #[test]
 820  fn test_encode_utf8() {
 821      fn check(input: char, expect: &[u8]) { // encodes `input` and compares the written prefix with `expect`
 822          let mut buf = [0u8, ..4];
 823          let n = input.encode_utf8(buf /* as mut slice! */);
 824          assert_eq!(buf.slice_to(n), expect);
 825      }
 826
 827      check('x', [0x78]);                             // one byte
 828      check('\u00e9', [0xc3, 0xa9]);                  // two bytes
 829      check('\ua66e', [0xea, 0x99, 0xae]);            // three bytes
 830      check('\U0001f4a9', [0xf0, 0x9f, 0x92, 0xa9]);  // four bytes
 831  }
832
 833  #[test]
 834  fn test_encode_utf16() {
 835      fn check(input: char, expect: &[u16]) { // encodes `input` and compares the written prefix with `expect`
 836          let mut buf = [0u16, ..2];
 837          let n = input.encode_utf16(buf /* as mut slice! */);
 838          assert_eq!(buf.slice_to(n), expect);
 839      }
 840
 841      check('x', [0x0078]);                   // 16-bit values are stored as a single unit
 842      check('\u00e9', [0x00e9]);
 843      check('\ua66e', [0xa66e]);
 844      check('\U0001f4a9', [0xd83d, 0xdca9]);  // above 0xFFFF: a surrogate pair
 845  }

libstd/char.rs:135:10-135:10 -fn- definition:
pub fn is_whitespace(c: char) -> bool {
    // As an optimization ASCII whitespace characters are checked separately
    c == ' '
references:- 6
590:     fn is_whitespace(&self) -> bool { is_whitespace(*self) }
libstd/str.rs:
2382:     fn words(&self) -> Words<'a> {
2383:         self.split(char::is_whitespace).filter(|s| !s.is_empty())
2384:     }
--
2494:     fn trim_right(&self) -> &'a str {
2495:         self.trim_right_chars(&char::is_whitespace)
2496:     }
libstd/fmt/parse.rs:
291:             match self.cur.clone().next() {
292:                 Some((_, c)) if char::is_whitespace(c) => { self.cur.next(); }
293:                 Some(..) | None => { return }
libstd/str.rs:
2406:     #[inline]
2407:     fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }

libstd/char.rs:303:43-303:43 -fn- definition:
// Decompose a precomposed Hangul syllable
fn decompose_hangul(s: char, f: |char|) {
    let si = s as uint - S_BASE;
references:- 2
334:     } else {
335:         decompose_hangul(c, f);
336:     }

libstd/char.rs:104:68-104:68 -fn- definition:
/// mostly similar to ID_Start but modified for closure under NFKx.
pub fn is_XID_start(c: char) -> bool    { derived_property::XID_Start(c) }
/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property
references:- 2
582:     fn is_XID_start(&self) -> bool { is_XID_start(*self) }
libstd/fmt/parse.rs:
634:         let start = match self.cur.clone().next() {
635:             Some((pos, c)) if char::is_XID_start(c) => {
636:                 self.cur.next();

libstd/char.rs:274:10-274:10 -fn- definition:
pub fn from_digit(num: uint, radix: uint) -> Option<char> {
    if radix > 36 {
        fail!("from_digit: radix {} is to high (maximum 36)", num);
references:- 4
606:     fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }
libstd/num/strconv.rs:
390:             buf.push(char::from_digit(
391:                 current_digit.to_int().unwrap() as uint, radix).unwrap() as u8);
--
405:             let value2ascii = |val: uint| {
406:                 char::from_digit(val, radix).unwrap() as u8
407:             };

libstd/char.rs:149:10-149:10 -fn- definition:
pub fn is_alphanumeric(c: char) -> bool {
    derived_property::Alphabetic(c)
        || general_category::Nd(c)
references:- 2
592:     fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
libstd/str.rs:
2409:     #[inline]
2410:     fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }

libstd/char.rs:347:4-347:4 -fn- definition:
///
pub fn escape_unicode(c: char, f: |char|) {
    // avoid calling str::to_str_radix because we don't really need to allocate
references:- 2
608:     fn escape_unicode(&self, f: |char|) { escape_unicode(*self, f) }
libstd/repr.rs:
251:             _ => {
252:                 char::escape_unicode(ch, |c| {
253:                     let _ = self.writer.write([c as u8]);

libstd/char.rs:97:15-97:15 -fn- definition:
/// code point
pub fn is_alphabetic(c: char) -> bool   { derived_property::Alphabetic(c) }
/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property
references:- 4
libstd/path/windows.rs:
990:                     let c = path[0];
991:                     if c.is_ascii() && ::char::is_alphabetic(c as char) {
992:                         // \\?\C:\ path
--
1014:         let c = path[0];
1015:         if c.is_ascii() && ::char::is_alphabetic(c as char) {
1016:             return Some(DiskPrefix);
libstd/fmt/parse.rs:
351:                 match self.cur.clone().next() {
352:                     Some((_, c)) if char::is_alphabetic(c) => {
353:                         ArgumentNamed(self.word())
libstd/char.rs:
579: impl Char for char {
580:     fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }

libstd/char.rs:111:73-111:73 -fn- definition:
/// mostly similar to 'ID_Continue' but modified for closure under NFKx.
pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
///
references:- 2
584:     fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
libstd/fmt/parse.rs:
643:             match self.cur.clone().next() {
644:                 Some((_, c)) if char::is_XID_continue(c) => {
645:                     self.cur.next();

libstd/char.rs:215:10-215:10 -fn- definition:
pub fn to_digit(c: char, radix: uint) -> Option<uint> {
    if radix > 36 {
        fail!("to_digit: radix {} is too high (maximum 36)", radix);
references:- 6
194: pub fn is_digit_radix(c: char, radix: uint) -> bool {
195:     match to_digit(c, radix) {
196:         Some(_) => true,
libstd/num/strconv.rs:
676:             match char::to_digit(c, radix) {
677:                 Some(digit) => {
libstd/fmt/parse.rs:
661:                 Some((_, c)) => {
662:                     match char::to_digit(c, 10) {
663:                         Some(i) => {
libstd/char.rs:
600:     fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }