Frames | No Frames |
1: /* java.lang.Character -- Wrapper class for char, and Unicode subsets 2: Copyright (C) 1998, 1999, 2001, 2002, 2005, 2006 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: /* 39: * Note: This class must not be merged with Classpath. 
Gcj uses C-style 40: * arrays (see include/java-chartables.h) to store the Unicode character 41: * database, whereas Classpath uses Java objects (char[] extracted from 42: * String constants) in gnu.java.lang.CharData. Gcj's approach is more 43: * efficient, because there is no vtable or data relocation to worry about. 44: * However, despite the difference in the database interface, the two 45: * versions share identical algorithms. 46: */ 47: 48: package java.lang; 49: 50: import java.io.Serializable; 51: import java.text.Collator; 52: import java.util.Locale; 53: 54: /** 55: * Wrapper class for the primitive char data type. In addition, this class 56: * allows one to retrieve property information and perform transformations 57: * on the 57,707 defined characters in the Unicode Standard, Version 3.0.0. 58: * java.lang.Character is designed to be very dynamic, and as such, it 59: * retrieves information on the Unicode character set from a separate 60: * database, gnu.java.lang.CharData, which can be easily upgraded. 61: * 62: * <p>For predicates, boundaries are used to describe 63: * the set of characters for which the method will return true. 64: * This syntax uses fairly normal regular expression notation. 65: * See 5.13 of the Unicode Standard, Version 3.0, for the 66: * boundary specification. 67: * 68: * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a> 69: * for more information on the Unicode Standard. 70: * 71: * @author Tom Tromey (tromey@cygnus.com) 72: * @author Paul N. Fisher 73: * @author Jochen Hoenicke 74: * @author Eric Blake (ebb9@email.byu.edu) 75: * @since 1.0 76: * @status updated to 1.4 77: */ 78: public final class Character implements Serializable, Comparable 79: { 80: /** 81: * A subset of Unicode blocks. 82: * 83: * @author Paul N. Fisher 84: * @author Eric Blake (ebb9@email.byu.edu) 85: * @since 1.2 86: */ 87: public static class Subset 88: { 89: /** The name of the subset. 
*/ 90: private final String name; 91: 92: /** 93: * Construct a new subset of characters. 94: * 95: * @param name the name of the subset 96: * @throws NullPointerException if name is null 97: */ 98: protected Subset(String name) 99: { 100: // Note that name.toString() is name, unless name was null. 101: this.name = name.toString(); 102: } 103: 104: /** 105: * Compares two Subsets for equality. This is <code>final</code>, and 106: * restricts the comparison on the <code>==</code> operator, so it returns 107: * true only for the same object. 108: * 109: * @param o the object to compare 110: * @return true if o is this 111: */ 112: public final boolean equals(Object o) 113: { 114: return o == this; 115: } 116: 117: /** 118: * Makes the original hashCode of Object final, to be consistent with 119: * equals. 120: * 121: * @return the hash code for this object 122: */ 123: public final int hashCode() 124: { 125: return super.hashCode(); 126: } 127: 128: /** 129: * Returns the name of the subset. 130: * 131: * @return the name 132: */ 133: public final String toString() 134: { 135: return name; 136: } 137: } // class Subset 138: 139: /** 140: * A family of character subsets in the Unicode specification. A character 141: * is in at most one of these blocks. 142: * 143: * This inner class was generated automatically from 144: * <code>libjava/gnu/gcj/convert/Blocks-3.txt</code>, by some perl scripts. 145: * This Unicode definition file can be found on the 146: * <a href="http://www.unicode.org">http://www.unicode.org</a> website. 147: * JDK 1.4 uses Unicode version 3.0.0. 148: * 149: * @author scripts/unicode-blocks.pl (written by Eric Blake) 150: * @since 1.2 151: */ 152: public static final class UnicodeBlock extends Subset 153: { 154: /** The start of the subset. */ 155: private final int start; 156: 157: /** The end of the subset. */ 158: private final int end; 159: 160: /** The canonical name of the block according to the Unicode standard. 
*/ 161: private final String canonicalName; 162: 163: /** Constants for the <code>forName()</code> method */ 164: private static final int CANONICAL_NAME = 0; 165: private static final int NO_SPACES_NAME = 1; 166: private static final int CONSTANT_NAME = 2; 167: 168: /** 169: * Constructor for strictly defined blocks. 170: * 171: * @param start the start character of the range 172: * @param end the end character of the range 173: * @param name the block name 174: */ 175: private UnicodeBlock(int start, int end, String name, 176: String canonicalName) 177: { 178: super(name); 179: this.start = start; 180: this.end = end; 181: this.canonicalName = canonicalName; 182: } 183: 184: /** 185: * Returns the Unicode character block which a character belongs to. 186: * <strong>Note</strong>: This method does not support the use of 187: * supplementary characters. For such support, <code>of(int)</code> 188: * should be used instead. 189: * 190: * @param ch the character to look up 191: * @return the set it belongs to, or null if it is not in one 192: */ 193: public static UnicodeBlock of(char ch) 194: { 195: return of((int) ch); 196: } 197: 198: /** 199: * Returns the Unicode character block which a code point belongs to. 200: * 201: * @param codePoint the character to look up 202: * @return the set it belongs to, or null if it is not in one. 203: * @throws IllegalArgumentException if the specified code point is 204: * invalid. 205: * @since 1.5 206: */ 207: public static UnicodeBlock of(int codePoint) 208: { 209: if (codePoint > MAX_CODE_POINT) 210: throw new IllegalArgumentException("The supplied integer value is " + 211: "too large to be a codepoint."); 212: // Simple binary search for the correct block. 
213: int low = 0; 214: int hi = sets.length - 1; 215: while (low <= hi) 216: { 217: int mid = (low + hi) >> 1; 218: UnicodeBlock b = sets[mid]; 219: if (codePoint < b.start) 220: hi = mid - 1; 221: else if (codePoint > b.end) 222: low = mid + 1; 223: else 224: return b; 225: } 226: return null; 227: } 228: 229: /** 230: * <p> 231: * Returns the <code>UnicodeBlock</code> with the given name, as defined 232: * by the Unicode standard. The version of Unicode in use is defined by 233: * the <code>Character</code> class, and the names are given in the 234: * <code>Blocks-<version>.txt</code> file corresponding to that version. 235: * The name may be specified in one of three ways: 236: * </p> 237: * <ol> 238: * <li>The canonical, human-readable name used by the Unicode standard. 239: * This is the name with all spaces and hyphens retained. For example, 240: * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li> 241: * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li> 242: * <li>The name used for the constants specified by this class, which 243: * is the canonical name with all spaces and hyphens replaced with 244: * underscores e.g. `BASIC_LATIN'</li> 245: * </ol> 246: * <p> 247: * The names are compared case-insensitively using the case comparison 248: * associated with the U.S. English locale. The method recognises the 249: * previous names used for blocks as well as the current ones. At 250: * present, this simply means that the deprecated `SURROGATES_AREA' 251: * will be recognised by this method (the <code>of()</code> methods 252: * only return one of the three new surrogate blocks). 253: * </p> 254: * 255: * @param blockName the name of the block to look up. 256: * @return the specified block. 257: * @throws NullPointerException if the <code>blockName</code> is 258: * <code>null</code>. 259: * @throws IllegalArgumentException if the name does not match any Unicode 260: * block. 
261: * @since 1.5 262: */ 263: public static final UnicodeBlock forName(String blockName) 264: { 265: int type; 266: if (blockName.indexOf(' ') != -1) 267: type = CANONICAL_NAME; 268: else if (blockName.indexOf('_') != -1) 269: type = CONSTANT_NAME; 270: else 271: type = NO_SPACES_NAME; 272: Collator usCollator = Collator.getInstance(Locale.US); 273: usCollator.setStrength(Collator.PRIMARY); 274: /* Special case for deprecated blocks not in sets */ 275: switch (type) 276: { 277: case CANONICAL_NAME: 278: if (usCollator.compare(blockName, "Surrogates Area") == 0) 279: return SURROGATES_AREA; 280: break; 281: case NO_SPACES_NAME: 282: if (usCollator.compare(blockName, "SurrogatesArea") == 0) 283: return SURROGATES_AREA; 284: break; 285: case CONSTANT_NAME: 286: if (usCollator.compare(blockName, "SURROGATES_AREA") == 0) 287: return SURROGATES_AREA; 288: break; 289: } 290: /* Other cases */ 291: int setLength = sets.length; 292: switch (type) 293: { 294: case CANONICAL_NAME: 295: for (int i = 0; i < setLength; i++) 296: { 297: UnicodeBlock block = sets[i]; 298: if (usCollator.compare(blockName, block.canonicalName) == 0) 299: return block; 300: } 301: break; 302: case NO_SPACES_NAME: 303: for (int i = 0; i < setLength; i++) 304: { 305: UnicodeBlock block = sets[i]; 306: String nsName = block.canonicalName.replaceAll(" ",""); 307: if (usCollator.compare(blockName, nsName) == 0) 308: return block; 309: } 310: break; 311: case CONSTANT_NAME: 312: for (int i = 0; i < setLength; i++) 313: { 314: UnicodeBlock block = sets[i]; 315: if (usCollator.compare(blockName, block.toString()) == 0) 316: return block; 317: } 318: break; 319: } 320: throw new IllegalArgumentException("No Unicode block found for " + 321: blockName + "."); 322: } 323: 324: /** 325: * Basic Latin. 326: * 0x0000 - 0x007F. 327: */ 328: public static final UnicodeBlock BASIC_LATIN 329: = new UnicodeBlock(0x0000, 0x007F, 330: "BASIC_LATIN", 331: "Basic Latin"); 332: 333: /** 334: * Latin-1 Supplement. 
335: * 0x0080 - 0x00FF. 336: */ 337: public static final UnicodeBlock LATIN_1_SUPPLEMENT 338: = new UnicodeBlock(0x0080, 0x00FF, 339: "LATIN_1_SUPPLEMENT", 340: "Latin-1 Supplement"); 341: 342: /** 343: * Latin Extended-A. 344: * 0x0100 - 0x017F. 345: */ 346: public static final UnicodeBlock LATIN_EXTENDED_A 347: = new UnicodeBlock(0x0100, 0x017F, 348: "LATIN_EXTENDED_A", 349: "Latin Extended-A"); 350: 351: /** 352: * Latin Extended-B. 353: * 0x0180 - 0x024F. 354: */ 355: public static final UnicodeBlock LATIN_EXTENDED_B 356: = new UnicodeBlock(0x0180, 0x024F, 357: "LATIN_EXTENDED_B", 358: "Latin Extended-B"); 359: 360: /** 361: * IPA Extensions. 362: * 0x0250 - 0x02AF. 363: */ 364: public static final UnicodeBlock IPA_EXTENSIONS 365: = new UnicodeBlock(0x0250, 0x02AF, 366: "IPA_EXTENSIONS", 367: "IPA Extensions"); 368: 369: /** 370: * Spacing Modifier Letters. 371: * 0x02B0 - 0x02FF. 372: */ 373: public static final UnicodeBlock SPACING_MODIFIER_LETTERS 374: = new UnicodeBlock(0x02B0, 0x02FF, 375: "SPACING_MODIFIER_LETTERS", 376: "Spacing Modifier Letters"); 377: 378: /** 379: * Combining Diacritical Marks. 380: * 0x0300 - 0x036F. 381: */ 382: public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS 383: = new UnicodeBlock(0x0300, 0x036F, 384: "COMBINING_DIACRITICAL_MARKS", 385: "Combining Diacritical Marks"); 386: 387: /** 388: * Greek. 389: * 0x0370 - 0x03FF. 390: */ 391: public static final UnicodeBlock GREEK 392: = new UnicodeBlock(0x0370, 0x03FF, 393: "GREEK", 394: "Greek"); 395: 396: /** 397: * Cyrillic. 398: * 0x0400 - 0x04FF. 399: */ 400: public static final UnicodeBlock CYRILLIC 401: = new UnicodeBlock(0x0400, 0x04FF, 402: "CYRILLIC", 403: "Cyrillic"); 404: 405: /** 406: * Cyrillic Supplementary. 407: * 0x0500 - 0x052F. 408: * @since 1.5 409: */ 410: public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY 411: = new UnicodeBlock(0x0500, 0x052F, 412: "CYRILLIC_SUPPLEMENTARY", 413: "Cyrillic Supplementary"); 414: 415: /** 416: * Armenian. 
417: * 0x0530 - 0x058F. 418: */ 419: public static final UnicodeBlock ARMENIAN 420: = new UnicodeBlock(0x0530, 0x058F, 421: "ARMENIAN", 422: "Armenian"); 423: 424: /** 425: * Hebrew. 426: * 0x0590 - 0x05FF. 427: */ 428: public static final UnicodeBlock HEBREW 429: = new UnicodeBlock(0x0590, 0x05FF, 430: "HEBREW", 431: "Hebrew"); 432: 433: /** 434: * Arabic. 435: * 0x0600 - 0x06FF. 436: */ 437: public static final UnicodeBlock ARABIC 438: = new UnicodeBlock(0x0600, 0x06FF, 439: "ARABIC", 440: "Arabic"); 441: 442: /** 443: * Syriac. 444: * 0x0700 - 0x074F. 445: * @since 1.4 446: */ 447: public static final UnicodeBlock SYRIAC 448: = new UnicodeBlock(0x0700, 0x074F, 449: "SYRIAC", 450: "Syriac"); 451: 452: /** 453: * Thaana. 454: * 0x0780 - 0x07BF. 455: * @since 1.4 456: */ 457: public static final UnicodeBlock THAANA 458: = new UnicodeBlock(0x0780, 0x07BF, 459: "THAANA", 460: "Thaana"); 461: 462: /** 463: * Devanagari. 464: * 0x0900 - 0x097F. 465: */ 466: public static final UnicodeBlock DEVANAGARI 467: = new UnicodeBlock(0x0900, 0x097F, 468: "DEVANAGARI", 469: "Devanagari"); 470: 471: /** 472: * Bengali. 473: * 0x0980 - 0x09FF. 474: */ 475: public static final UnicodeBlock BENGALI 476: = new UnicodeBlock(0x0980, 0x09FF, 477: "BENGALI", 478: "Bengali"); 479: 480: /** 481: * Gurmukhi. 482: * 0x0A00 - 0x0A7F. 483: */ 484: public static final UnicodeBlock GURMUKHI 485: = new UnicodeBlock(0x0A00, 0x0A7F, 486: "GURMUKHI", 487: "Gurmukhi"); 488: 489: /** 490: * Gujarati. 491: * 0x0A80 - 0x0AFF. 492: */ 493: public static final UnicodeBlock GUJARATI 494: = new UnicodeBlock(0x0A80, 0x0AFF, 495: "GUJARATI", 496: "Gujarati"); 497: 498: /** 499: * Oriya. 500: * 0x0B00 - 0x0B7F. 501: */ 502: public static final UnicodeBlock ORIYA 503: = new UnicodeBlock(0x0B00, 0x0B7F, 504: "ORIYA", 505: "Oriya"); 506: 507: /** 508: * Tamil. 509: * 0x0B80 - 0x0BFF. 
510: */ 511: public static final UnicodeBlock TAMIL 512: = new UnicodeBlock(0x0B80, 0x0BFF, 513: "TAMIL", 514: "Tamil"); 515: 516: /** 517: * Telugu. 518: * 0x0C00 - 0x0C7F. 519: */ 520: public static final UnicodeBlock TELUGU 521: = new UnicodeBlock(0x0C00, 0x0C7F, 522: "TELUGU", 523: "Telugu"); 524: 525: /** 526: * Kannada. 527: * 0x0C80 - 0x0CFF. 528: */ 529: public static final UnicodeBlock KANNADA 530: = new UnicodeBlock(0x0C80, 0x0CFF, 531: "KANNADA", 532: "Kannada"); 533: 534: /** 535: * Malayalam. 536: * 0x0D00 - 0x0D7F. 537: */ 538: public static final UnicodeBlock MALAYALAM 539: = new UnicodeBlock(0x0D00, 0x0D7F, 540: "MALAYALAM", 541: "Malayalam"); 542: 543: /** 544: * Sinhala. 545: * 0x0D80 - 0x0DFF. 546: * @since 1.4 547: */ 548: public static final UnicodeBlock SINHALA 549: = new UnicodeBlock(0x0D80, 0x0DFF, 550: "SINHALA", 551: "Sinhala"); 552: 553: /** 554: * Thai. 555: * 0x0E00 - 0x0E7F. 556: */ 557: public static final UnicodeBlock THAI 558: = new UnicodeBlock(0x0E00, 0x0E7F, 559: "THAI", 560: "Thai"); 561: 562: /** 563: * Lao. 564: * 0x0E80 - 0x0EFF. 565: */ 566: public static final UnicodeBlock LAO 567: = new UnicodeBlock(0x0E80, 0x0EFF, 568: "LAO", 569: "Lao"); 570: 571: /** 572: * Tibetan. 573: * 0x0F00 - 0x0FFF. 574: */ 575: public static final UnicodeBlock TIBETAN 576: = new UnicodeBlock(0x0F00, 0x0FFF, 577: "TIBETAN", 578: "Tibetan"); 579: 580: /** 581: * Myanmar. 582: * 0x1000 - 0x109F. 583: * @since 1.4 584: */ 585: public static final UnicodeBlock MYANMAR 586: = new UnicodeBlock(0x1000, 0x109F, 587: "MYANMAR", 588: "Myanmar"); 589: 590: /** 591: * Georgian. 592: * 0x10A0 - 0x10FF. 593: */ 594: public static final UnicodeBlock GEORGIAN 595: = new UnicodeBlock(0x10A0, 0x10FF, 596: "GEORGIAN", 597: "Georgian"); 598: 599: /** 600: * Hangul Jamo. 601: * 0x1100 - 0x11FF. 602: */ 603: public static final UnicodeBlock HANGUL_JAMO 604: = new UnicodeBlock(0x1100, 0x11FF, 605: "HANGUL_JAMO", 606: "Hangul Jamo"); 607: 608: /** 609: * Ethiopic. 
610: * 0x1200 - 0x137F. 611: * @since 1.4 612: */ 613: public static final UnicodeBlock ETHIOPIC 614: = new UnicodeBlock(0x1200, 0x137F, 615: "ETHIOPIC", 616: "Ethiopic"); 617: 618: /** 619: * Cherokee. 620: * 0x13A0 - 0x13FF. 621: * @since 1.4 622: */ 623: public static final UnicodeBlock CHEROKEE 624: = new UnicodeBlock(0x13A0, 0x13FF, 625: "CHEROKEE", 626: "Cherokee"); 627: 628: /** 629: * Unified Canadian Aboriginal Syllabics. 630: * 0x1400 - 0x167F. 631: * @since 1.4 632: */ 633: public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 634: = new UnicodeBlock(0x1400, 0x167F, 635: "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS", 636: "Unified Canadian Aboriginal Syllabics"); 637: 638: /** 639: * Ogham. 640: * 0x1680 - 0x169F. 641: * @since 1.4 642: */ 643: public static final UnicodeBlock OGHAM 644: = new UnicodeBlock(0x1680, 0x169F, 645: "OGHAM", 646: "Ogham"); 647: 648: /** 649: * Runic. 650: * 0x16A0 - 0x16FF. 651: * @since 1.4 652: */ 653: public static final UnicodeBlock RUNIC 654: = new UnicodeBlock(0x16A0, 0x16FF, 655: "RUNIC", 656: "Runic"); 657: 658: /** 659: * Tagalog. 660: * 0x1700 - 0x171F. 661: * @since 1.5 662: */ 663: public static final UnicodeBlock TAGALOG 664: = new UnicodeBlock(0x1700, 0x171F, 665: "TAGALOG", 666: "Tagalog"); 667: 668: /** 669: * Hanunoo. 670: * 0x1720 - 0x173F. 671: * @since 1.5 672: */ 673: public static final UnicodeBlock HANUNOO 674: = new UnicodeBlock(0x1720, 0x173F, 675: "HANUNOO", 676: "Hanunoo"); 677: 678: /** 679: * Buhid. 680: * 0x1740 - 0x175F. 681: * @since 1.5 682: */ 683: public static final UnicodeBlock BUHID 684: = new UnicodeBlock(0x1740, 0x175F, 685: "BUHID", 686: "Buhid"); 687: 688: /** 689: * Tagbanwa. 690: * 0x1760 - 0x177F. 691: * @since 1.5 692: */ 693: public static final UnicodeBlock TAGBANWA 694: = new UnicodeBlock(0x1760, 0x177F, 695: "TAGBANWA", 696: "Tagbanwa"); 697: 698: /** 699: * Khmer. 700: * 0x1780 - 0x17FF. 
701: * @since 1.4 702: */ 703: public static final UnicodeBlock KHMER 704: = new UnicodeBlock(0x1780, 0x17FF, 705: "KHMER", 706: "Khmer"); 707: 708: /** 709: * Mongolian. 710: * 0x1800 - 0x18AF. 711: * @since 1.4 712: */ 713: public static final UnicodeBlock MONGOLIAN 714: = new UnicodeBlock(0x1800, 0x18AF, 715: "MONGOLIAN", 716: "Mongolian"); 717: 718: /** 719: * Limbu. 720: * 0x1900 - 0x194F. 721: * @since 1.5 722: */ 723: public static final UnicodeBlock LIMBU 724: = new UnicodeBlock(0x1900, 0x194F, 725: "LIMBU", 726: "Limbu"); 727: 728: /** 729: * Tai Le. 730: * 0x1950 - 0x197F. 731: * @since 1.5 732: */ 733: public static final UnicodeBlock TAI_LE 734: = new UnicodeBlock(0x1950, 0x197F, 735: "TAI_LE", 736: "Tai Le"); 737: 738: /** 739: * Khmer Symbols. 740: * 0x19E0 - 0x19FF. 741: * @since 1.5 742: */ 743: public static final UnicodeBlock KHMER_SYMBOLS 744: = new UnicodeBlock(0x19E0, 0x19FF, 745: "KHMER_SYMBOLS", 746: "Khmer Symbols"); 747: 748: /** 749: * Phonetic Extensions. 750: * 0x1D00 - 0x1D7F. 751: * @since 1.5 752: */ 753: public static final UnicodeBlock PHONETIC_EXTENSIONS 754: = new UnicodeBlock(0x1D00, 0x1D7F, 755: "PHONETIC_EXTENSIONS", 756: "Phonetic Extensions"); 757: 758: /** 759: * Latin Extended Additional. 760: * 0x1E00 - 0x1EFF. 761: */ 762: public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL 763: = new UnicodeBlock(0x1E00, 0x1EFF, 764: "LATIN_EXTENDED_ADDITIONAL", 765: "Latin Extended Additional"); 766: 767: /** 768: * Greek Extended. 769: * 0x1F00 - 0x1FFF. 770: */ 771: public static final UnicodeBlock GREEK_EXTENDED 772: = new UnicodeBlock(0x1F00, 0x1FFF, 773: "GREEK_EXTENDED", 774: "Greek Extended"); 775: 776: /** 777: * General Punctuation. 778: * 0x2000 - 0x206F. 779: */ 780: public static final UnicodeBlock GENERAL_PUNCTUATION 781: = new UnicodeBlock(0x2000, 0x206F, 782: "GENERAL_PUNCTUATION", 783: "General Punctuation"); 784: 785: /** 786: * Superscripts and Subscripts. 787: * 0x2070 - 0x209F. 
788: */ 789: public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS 790: = new UnicodeBlock(0x2070, 0x209F, 791: "SUPERSCRIPTS_AND_SUBSCRIPTS", 792: "Superscripts and Subscripts"); 793: 794: /** 795: * Currency Symbols. 796: * 0x20A0 - 0x20CF. 797: */ 798: public static final UnicodeBlock CURRENCY_SYMBOLS 799: = new UnicodeBlock(0x20A0, 0x20CF, 800: "CURRENCY_SYMBOLS", 801: "Currency Symbols"); 802: 803: /** 804: * Combining Marks for Symbols. 805: * 0x20D0 - 0x20FF. 806: */ 807: public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS 808: = new UnicodeBlock(0x20D0, 0x20FF, 809: "COMBINING_MARKS_FOR_SYMBOLS", 810: "Combining Marks for Symbols"); 811: 812: /** 813: * Letterlike Symbols. 814: * 0x2100 - 0x214F. 815: */ 816: public static final UnicodeBlock LETTERLIKE_SYMBOLS 817: = new UnicodeBlock(0x2100, 0x214F, 818: "LETTERLIKE_SYMBOLS", 819: "Letterlike Symbols"); 820: 821: /** 822: * Number Forms. 823: * 0x2150 - 0x218F. 824: */ 825: public static final UnicodeBlock NUMBER_FORMS 826: = new UnicodeBlock(0x2150, 0x218F, 827: "NUMBER_FORMS", 828: "Number Forms"); 829: 830: /** 831: * Arrows. 832: * 0x2190 - 0x21FF. 833: */ 834: public static final UnicodeBlock ARROWS 835: = new UnicodeBlock(0x2190, 0x21FF, 836: "ARROWS", 837: "Arrows"); 838: 839: /** 840: * Mathematical Operators. 841: * 0x2200 - 0x22FF. 842: */ 843: public static final UnicodeBlock MATHEMATICAL_OPERATORS 844: = new UnicodeBlock(0x2200, 0x22FF, 845: "MATHEMATICAL_OPERATORS", 846: "Mathematical Operators"); 847: 848: /** 849: * Miscellaneous Technical. 850: * 0x2300 - 0x23FF. 851: */ 852: public static final UnicodeBlock MISCELLANEOUS_TECHNICAL 853: = new UnicodeBlock(0x2300, 0x23FF, 854: "MISCELLANEOUS_TECHNICAL", 855: "Miscellaneous Technical"); 856: 857: /** 858: * Control Pictures. 859: * 0x2400 - 0x243F. 
860: */ 861: public static final UnicodeBlock CONTROL_PICTURES 862: = new UnicodeBlock(0x2400, 0x243F, 863: "CONTROL_PICTURES", 864: "Control Pictures"); 865: 866: /** 867: * Optical Character Recognition. 868: * 0x2440 - 0x245F. 869: */ 870: public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION 871: = new UnicodeBlock(0x2440, 0x245F, 872: "OPTICAL_CHARACTER_RECOGNITION", 873: "Optical Character Recognition"); 874: 875: /** 876: * Enclosed Alphanumerics. 877: * 0x2460 - 0x24FF. 878: */ 879: public static final UnicodeBlock ENCLOSED_ALPHANUMERICS 880: = new UnicodeBlock(0x2460, 0x24FF, 881: "ENCLOSED_ALPHANUMERICS", 882: "Enclosed Alphanumerics"); 883: 884: /** 885: * Box Drawing. 886: * 0x2500 - 0x257F. 887: */ 888: public static final UnicodeBlock BOX_DRAWING 889: = new UnicodeBlock(0x2500, 0x257F, 890: "BOX_DRAWING", 891: "Box Drawing"); 892: 893: /** 894: * Block Elements. 895: * 0x2580 - 0x259F. 896: */ 897: public static final UnicodeBlock BLOCK_ELEMENTS 898: = new UnicodeBlock(0x2580, 0x259F, 899: "BLOCK_ELEMENTS", 900: "Block Elements"); 901: 902: /** 903: * Geometric Shapes. 904: * 0x25A0 - 0x25FF. 905: */ 906: public static final UnicodeBlock GEOMETRIC_SHAPES 907: = new UnicodeBlock(0x25A0, 0x25FF, 908: "GEOMETRIC_SHAPES", 909: "Geometric Shapes"); 910: 911: /** 912: * Miscellaneous Symbols. 913: * 0x2600 - 0x26FF. 914: */ 915: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS 916: = new UnicodeBlock(0x2600, 0x26FF, 917: "MISCELLANEOUS_SYMBOLS", 918: "Miscellaneous Symbols"); 919: 920: /** 921: * Dingbats. 922: * 0x2700 - 0x27BF. 923: */ 924: public static final UnicodeBlock DINGBATS 925: = new UnicodeBlock(0x2700, 0x27BF, 926: "DINGBATS", 927: "Dingbats"); 928: 929: /** 930: * Miscellaneous Mathematical Symbols-A. 931: * 0x27C0 - 0x27EF. 
932: * @since 1.5 933: */ 934: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A 935: = new UnicodeBlock(0x27C0, 0x27EF, 936: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A", 937: "Miscellaneous Mathematical Symbols-A"); 938: 939: /** 940: * Supplemental Arrows-A. 941: * 0x27F0 - 0x27FF. 942: * @since 1.5 943: */ 944: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A 945: = new UnicodeBlock(0x27F0, 0x27FF, 946: "SUPPLEMENTAL_ARROWS_A", 947: "Supplemental Arrows-A"); 948: 949: /** 950: * Braille Patterns. 951: * 0x2800 - 0x28FF. 952: * @since 1.4 953: */ 954: public static final UnicodeBlock BRAILLE_PATTERNS 955: = new UnicodeBlock(0x2800, 0x28FF, 956: "BRAILLE_PATTERNS", 957: "Braille Patterns"); 958: 959: /** 960: * Supplemental Arrows-B. 961: * 0x2900 - 0x297F. 962: * @since 1.5 963: */ 964: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B 965: = new UnicodeBlock(0x2900, 0x297F, 966: "SUPPLEMENTAL_ARROWS_B", 967: "Supplemental Arrows-B"); 968: 969: /** 970: * Miscellaneous Mathematical Symbols-B. 971: * 0x2980 - 0x29FF. 972: * @since 1.5 973: */ 974: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B 975: = new UnicodeBlock(0x2980, 0x29FF, 976: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B", 977: "Miscellaneous Mathematical Symbols-B"); 978: 979: /** 980: * Supplemental Mathematical Operators. 981: * 0x2A00 - 0x2AFF. 982: * @since 1.5 983: */ 984: public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS 985: = new UnicodeBlock(0x2A00, 0x2AFF, 986: "SUPPLEMENTAL_MATHEMATICAL_OPERATORS", 987: "Supplemental Mathematical Operators"); 988: 989: /** 990: * Miscellaneous Symbols and Arrows. 991: * 0x2B00 - 0x2BFF. 992: * @since 1.5 993: */ 994: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS 995: = new UnicodeBlock(0x2B00, 0x2BFF, 996: "MISCELLANEOUS_SYMBOLS_AND_ARROWS", 997: "Miscellaneous Symbols and Arrows"); 998: 999: /** 1000: * CJK Radicals Supplement. 1001: * 0x2E80 - 0x2EFF. 
1002: * @since 1.4 1003: */ 1004: public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT 1005: = new UnicodeBlock(0x2E80, 0x2EFF, 1006: "CJK_RADICALS_SUPPLEMENT", 1007: "CJK Radicals Supplement"); 1008: 1009: /** 1010: * Kangxi Radicals. 1011: * 0x2F00 - 0x2FDF. 1012: * @since 1.4 1013: */ 1014: public static final UnicodeBlock KANGXI_RADICALS 1015: = new UnicodeBlock(0x2F00, 0x2FDF, 1016: "KANGXI_RADICALS", 1017: "Kangxi Radicals"); 1018: 1019: /** 1020: * Ideographic Description Characters. 1021: * 0x2FF0 - 0x2FFF. 1022: * @since 1.4 1023: */ 1024: public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS 1025: = new UnicodeBlock(0x2FF0, 0x2FFF, 1026: "IDEOGRAPHIC_DESCRIPTION_CHARACTERS", 1027: "Ideographic Description Characters"); 1028: 1029: /** 1030: * CJK Symbols and Punctuation. 1031: * 0x3000 - 0x303F. 1032: */ 1033: public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION 1034: = new UnicodeBlock(0x3000, 0x303F, 1035: "CJK_SYMBOLS_AND_PUNCTUATION", 1036: "CJK Symbols and Punctuation"); 1037: 1038: /** 1039: * Hiragana. 1040: * 0x3040 - 0x309F. 1041: */ 1042: public static final UnicodeBlock HIRAGANA 1043: = new UnicodeBlock(0x3040, 0x309F, 1044: "HIRAGANA", 1045: "Hiragana"); 1046: 1047: /** 1048: * Katakana. 1049: * 0x30A0 - 0x30FF. 1050: */ 1051: public static final UnicodeBlock KATAKANA 1052: = new UnicodeBlock(0x30A0, 0x30FF, 1053: "KATAKANA", 1054: "Katakana"); 1055: 1056: /** 1057: * Bopomofo. 1058: * 0x3100 - 0x312F. 1059: */ 1060: public static final UnicodeBlock BOPOMOFO 1061: = new UnicodeBlock(0x3100, 0x312F, 1062: "BOPOMOFO", 1063: "Bopomofo"); 1064: 1065: /** 1066: * Hangul Compatibility Jamo. 1067: * 0x3130 - 0x318F. 1068: */ 1069: public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO 1070: = new UnicodeBlock(0x3130, 0x318F, 1071: "HANGUL_COMPATIBILITY_JAMO", 1072: "Hangul Compatibility Jamo"); 1073: 1074: /** 1075: * Kanbun. 1076: * 0x3190 - 0x319F. 
1077: */ 1078: public static final UnicodeBlock KANBUN 1079: = new UnicodeBlock(0x3190, 0x319F, 1080: "KANBUN", 1081: "Kanbun"); 1082: 1083: /** 1084: * Bopomofo Extended. 1085: * 0x31A0 - 0x31BF. 1086: * @since 1.4 1087: */ 1088: public static final UnicodeBlock BOPOMOFO_EXTENDED 1089: = new UnicodeBlock(0x31A0, 0x31BF, 1090: "BOPOMOFO_EXTENDED", 1091: "Bopomofo Extended"); 1092: 1093: /** 1094: * Katakana Phonetic Extensions. 1095: * 0x31F0 - 0x31FF. 1096: * @since 1.5 1097: */ 1098: public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS 1099: = new UnicodeBlock(0x31F0, 0x31FF, 1100: "KATAKANA_PHONETIC_EXTENSIONS", 1101: "Katakana Phonetic Extensions"); 1102: 1103: /** 1104: * Enclosed CJK Letters and Months. 1105: * 0x3200 - 0x32FF. 1106: */ 1107: public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS 1108: = new UnicodeBlock(0x3200, 0x32FF, 1109: "ENCLOSED_CJK_LETTERS_AND_MONTHS", 1110: "Enclosed CJK Letters and Months"); 1111: 1112: /** 1113: * CJK Compatibility. 1114: * 0x3300 - 0x33FF. 1115: */ 1116: public static final UnicodeBlock CJK_COMPATIBILITY 1117: = new UnicodeBlock(0x3300, 0x33FF, 1118: "CJK_COMPATIBILITY", 1119: "CJK Compatibility"); 1120: 1121: /** 1122: * CJK Unified Ideographs Extension A. 1123: * 0x3400 - 0x4DBF. 1124: * @since 1.4 1125: */ 1126: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 1127: = new UnicodeBlock(0x3400, 0x4DBF, 1128: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A", 1129: "CJK Unified Ideographs Extension A"); 1130: 1131: /** 1132: * Yijing Hexagram Symbols. 1133: * 0x4DC0 - 0x4DFF. 1134: * @since 1.5 1135: */ 1136: public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS 1137: = new UnicodeBlock(0x4DC0, 0x4DFF, 1138: "YIJING_HEXAGRAM_SYMBOLS", 1139: "Yijing Hexagram Symbols"); 1140: 1141: /** 1142: * CJK Unified Ideographs. 1143: * 0x4E00 - 0x9FFF. 
1144: */ 1145: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS 1146: = new UnicodeBlock(0x4E00, 0x9FFF, 1147: "CJK_UNIFIED_IDEOGRAPHS", 1148: "CJK Unified Ideographs"); 1149: 1150: /** 1151: * Yi Syllables. 1152: * 0xA000 - 0xA48F. 1153: * @since 1.4 1154: */ 1155: public static final UnicodeBlock YI_SYLLABLES 1156: = new UnicodeBlock(0xA000, 0xA48F, 1157: "YI_SYLLABLES", 1158: "Yi Syllables"); 1159: 1160: /** 1161: * Yi Radicals. 1162: * 0xA490 - 0xA4CF. 1163: * @since 1.4 1164: */ 1165: public static final UnicodeBlock YI_RADICALS 1166: = new UnicodeBlock(0xA490, 0xA4CF, 1167: "YI_RADICALS", 1168: "Yi Radicals"); 1169: 1170: /** 1171: * Hangul Syllables. 1172: * 0xAC00 - 0xD7AF. 1173: */ 1174: public static final UnicodeBlock HANGUL_SYLLABLES 1175: = new UnicodeBlock(0xAC00, 0xD7AF, 1176: "HANGUL_SYLLABLES", 1177: "Hangul Syllables"); 1178: 1179: /** 1180: * High Surrogates. 1181: * 0xD800 - 0xDB7F. 1182: * @since 1.5 1183: */ 1184: public static final UnicodeBlock HIGH_SURROGATES 1185: = new UnicodeBlock(0xD800, 0xDB7F, 1186: "HIGH_SURROGATES", 1187: "High Surrogates"); 1188: 1189: /** 1190: * High Private Use Surrogates. 1191: * 0xDB80 - 0xDBFF. 1192: * @since 1.5 1193: */ 1194: public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES 1195: = new UnicodeBlock(0xDB80, 0xDBFF, 1196: "HIGH_PRIVATE_USE_SURROGATES", 1197: "High Private Use Surrogates"); 1198: 1199: /** 1200: * Low Surrogates. 1201: * 0xDC00 - 0xDFFF. 1202: * @since 1.5 1203: */ 1204: public static final UnicodeBlock LOW_SURROGATES 1205: = new UnicodeBlock(0xDC00, 0xDFFF, 1206: "LOW_SURROGATES", 1207: "Low Surrogates"); 1208: 1209: /** 1210: * Private Use Area. 1211: * 0xE000 - 0xF8FF. 1212: */ 1213: public static final UnicodeBlock PRIVATE_USE_AREA 1214: = new UnicodeBlock(0xE000, 0xF8FF, 1215: "PRIVATE_USE_AREA", 1216: "Private Use Area"); 1217: 1218: /** 1219: * CJK Compatibility Ideographs. 1220: * 0xF900 - 0xFAFF. 
1221: */ 1222: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS 1223: = new UnicodeBlock(0xF900, 0xFAFF, 1224: "CJK_COMPATIBILITY_IDEOGRAPHS", 1225: "CJK Compatibility Ideographs"); 1226: 1227: /** 1228: * Alphabetic Presentation Forms. 1229: * 0xFB00 - 0xFB4F. 1230: */ 1231: public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS 1232: = new UnicodeBlock(0xFB00, 0xFB4F, 1233: "ALPHABETIC_PRESENTATION_FORMS", 1234: "Alphabetic Presentation Forms"); 1235: 1236: /** 1237: * Arabic Presentation Forms-A. 1238: * 0xFB50 - 0xFDFF. 1239: */ 1240: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A 1241: = new UnicodeBlock(0xFB50, 0xFDFF, 1242: "ARABIC_PRESENTATION_FORMS_A", 1243: "Arabic Presentation Forms-A"); 1244: 1245: /** 1246: * Variation Selectors. 1247: * 0xFE00 - 0xFE0F. 1248: * @since 1.5 1249: */ 1250: public static final UnicodeBlock VARIATION_SELECTORS 1251: = new UnicodeBlock(0xFE00, 0xFE0F, 1252: "VARIATION_SELECTORS", 1253: "Variation Selectors"); 1254: 1255: /** 1256: * Combining Half Marks. 1257: * 0xFE20 - 0xFE2F. 1258: */ 1259: public static final UnicodeBlock COMBINING_HALF_MARKS 1260: = new UnicodeBlock(0xFE20, 0xFE2F, 1261: "COMBINING_HALF_MARKS", 1262: "Combining Half Marks"); 1263: 1264: /** 1265: * CJK Compatibility Forms. 1266: * 0xFE30 - 0xFE4F. 1267: */ 1268: public static final UnicodeBlock CJK_COMPATIBILITY_FORMS 1269: = new UnicodeBlock(0xFE30, 0xFE4F, 1270: "CJK_COMPATIBILITY_FORMS", 1271: "CJK Compatibility Forms"); 1272: 1273: /** 1274: * Small Form Variants. 1275: * 0xFE50 - 0xFE6F. 1276: */ 1277: public static final UnicodeBlock SMALL_FORM_VARIANTS 1278: = new UnicodeBlock(0xFE50, 0xFE6F, 1279: "SMALL_FORM_VARIANTS", 1280: "Small Form Variants"); 1281: 1282: /** 1283: * Arabic Presentation Forms-B. 1284: * 0xFE70 - 0xFEFF. 
1285: */ 1286: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B 1287: = new UnicodeBlock(0xFE70, 0xFEFF, 1288: "ARABIC_PRESENTATION_FORMS_B", 1289: "Arabic Presentation Forms-B"); 1290: 1291: /** 1292: * Halfwidth and Fullwidth Forms. 1293: * 0xFF00 - 0xFFEF. 1294: */ 1295: public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS 1296: = new UnicodeBlock(0xFF00, 0xFFEF, 1297: "HALFWIDTH_AND_FULLWIDTH_FORMS", 1298: "Halfwidth and Fullwidth Forms"); 1299: 1300: /** 1301: * Specials. 1302: * 0xFFF0 - 0xFFFF. 1303: */ 1304: public static final UnicodeBlock SPECIALS 1305: = new UnicodeBlock(0xFFF0, 0xFFFF, 1306: "SPECIALS", 1307: "Specials"); 1308: 1309: /** 1310: * Linear B Syllabary. 1311: * 0x10000 - 0x1007F. 1312: * @since 1.5 1313: */ 1314: public static final UnicodeBlock LINEAR_B_SYLLABARY 1315: = new UnicodeBlock(0x10000, 0x1007F, 1316: "LINEAR_B_SYLLABARY", 1317: "Linear B Syllabary"); 1318: 1319: /** 1320: * Linear B Ideograms. 1321: * 0x10080 - 0x100FF. 1322: * @since 1.5 1323: */ 1324: public static final UnicodeBlock LINEAR_B_IDEOGRAMS 1325: = new UnicodeBlock(0x10080, 0x100FF, 1326: "LINEAR_B_IDEOGRAMS", 1327: "Linear B Ideograms"); 1328: 1329: /** 1330: * Aegean Numbers. 1331: * 0x10100 - 0x1013F. 1332: * @since 1.5 1333: */ 1334: public static final UnicodeBlock AEGEAN_NUMBERS 1335: = new UnicodeBlock(0x10100, 0x1013F, 1336: "AEGEAN_NUMBERS", 1337: "Aegean Numbers"); 1338: 1339: /** 1340: * Old Italic. 1341: * 0x10300 - 0x1032F. 1342: * @since 1.5 1343: */ 1344: public static final UnicodeBlock OLD_ITALIC 1345: = new UnicodeBlock(0x10300, 0x1032F, 1346: "OLD_ITALIC", 1347: "Old Italic"); 1348: 1349: /** 1350: * Gothic. 1351: * 0x10330 - 0x1034F. 1352: * @since 1.5 1353: */ 1354: public static final UnicodeBlock GOTHIC 1355: = new UnicodeBlock(0x10330, 0x1034F, 1356: "GOTHIC", 1357: "Gothic"); 1358: 1359: /** 1360: * Ugaritic. 1361: * 0x10380 - 0x1039F. 
1362: * @since 1.5 1363: */ 1364: public static final UnicodeBlock UGARITIC 1365: = new UnicodeBlock(0x10380, 0x1039F, 1366: "UGARITIC", 1367: "Ugaritic"); 1368: 1369: /** 1370: * Deseret. 1371: * 0x10400 - 0x1044F. 1372: * @since 1.5 1373: */ 1374: public static final UnicodeBlock DESERET 1375: = new UnicodeBlock(0x10400, 0x1044F, 1376: "DESERET", 1377: "Deseret"); 1378: 1379: /** 1380: * Shavian. 1381: * 0x10450 - 0x1047F. 1382: * @since 1.5 1383: */ 1384: public static final UnicodeBlock SHAVIAN 1385: = new UnicodeBlock(0x10450, 0x1047F, 1386: "SHAVIAN", 1387: "Shavian"); 1388: 1389: /** 1390: * Osmanya. 1391: * 0x10480 - 0x104AF. 1392: * @since 1.5 1393: */ 1394: public static final UnicodeBlock OSMANYA 1395: = new UnicodeBlock(0x10480, 0x104AF, 1396: "OSMANYA", 1397: "Osmanya"); 1398: 1399: /** 1400: * Cypriot Syllabary. 1401: * 0x10800 - 0x1083F. 1402: * @since 1.5 1403: */ 1404: public static final UnicodeBlock CYPRIOT_SYLLABARY 1405: = new UnicodeBlock(0x10800, 0x1083F, 1406: "CYPRIOT_SYLLABARY", 1407: "Cypriot Syllabary"); 1408: 1409: /** 1410: * Byzantine Musical Symbols. 1411: * 0x1D000 - 0x1D0FF. 1412: * @since 1.5 1413: */ 1414: public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS 1415: = new UnicodeBlock(0x1D000, 0x1D0FF, 1416: "BYZANTINE_MUSICAL_SYMBOLS", 1417: "Byzantine Musical Symbols"); 1418: 1419: /** 1420: * Musical Symbols. 1421: * 0x1D100 - 0x1D1FF. 1422: * @since 1.5 1423: */ 1424: public static final UnicodeBlock MUSICAL_SYMBOLS 1425: = new UnicodeBlock(0x1D100, 0x1D1FF, 1426: "MUSICAL_SYMBOLS", 1427: "Musical Symbols"); 1428: 1429: /** 1430: * Tai Xuan Jing Symbols. 1431: * 0x1D300 - 0x1D35F. 1432: * @since 1.5 1433: */ 1434: public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS 1435: = new UnicodeBlock(0x1D300, 0x1D35F, 1436: "TAI_XUAN_JING_SYMBOLS", 1437: "Tai Xuan Jing Symbols"); 1438: 1439: /** 1440: * Mathematical Alphanumeric Symbols. 1441: * 0x1D400 - 0x1D7FF. 
1442: * @since 1.5 1443: */ 1444: public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS 1445: = new UnicodeBlock(0x1D400, 0x1D7FF, 1446: "MATHEMATICAL_ALPHANUMERIC_SYMBOLS", 1447: "Mathematical Alphanumeric Symbols"); 1448: 1449: /** 1450: * CJK Unified Ideographs Extension B. 1451: * 0x20000 - 0x2A6DF. 1452: * @since 1.5 1453: */ 1454: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B 1455: = new UnicodeBlock(0x20000, 0x2A6DF, 1456: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B", 1457: "CJK Unified Ideographs Extension B"); 1458: 1459: /** 1460: * CJK Compatibility Ideographs Supplement. 1461: * 0x2F800 - 0x2FA1F. 1462: * @since 1.5 1463: */ 1464: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT 1465: = new UnicodeBlock(0x2F800, 0x2FA1F, 1466: "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT", 1467: "CJK Compatibility Ideographs Supplement"); 1468: 1469: /** 1470: * Tags. 1471: * 0xE0000 - 0xE007F. 1472: * @since 1.5 1473: */ 1474: public static final UnicodeBlock TAGS 1475: = new UnicodeBlock(0xE0000, 0xE007F, 1476: "TAGS", 1477: "Tags"); 1478: 1479: /** 1480: * Variation Selectors Supplement. 1481: * 0xE0100 - 0xE01EF. 1482: * @since 1.5 1483: */ 1484: public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT 1485: = new UnicodeBlock(0xE0100, 0xE01EF, 1486: "VARIATION_SELECTORS_SUPPLEMENT", 1487: "Variation Selectors Supplement"); 1488: 1489: /** 1490: * Supplementary Private Use Area-A. 1491: * 0xF0000 - 0xFFFFF. 1492: * @since 1.5 1493: */ 1494: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A 1495: = new UnicodeBlock(0xF0000, 0xFFFFF, 1496: "SUPPLEMENTARY_PRIVATE_USE_AREA_A", 1497: "Supplementary Private Use Area-A"); 1498: 1499: /** 1500: * Supplementary Private Use Area-B. 1501: * 0x100000 - 0x10FFFF. 
1502: * @since 1.5 1503: */ 1504: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B 1505: = new UnicodeBlock(0x100000, 0x10FFFF, 1506: "SUPPLEMENTARY_PRIVATE_USE_AREA_B", 1507: "Supplementary Private Use Area-B"); 1508: 1509: /** 1510: * Surrogates Area. 1511: * 0xD800 - 0xDFFF. 1512: * @deprecated As of 1.5, the three areas, 1513: * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>, 1514: * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a> 1515: * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined 1516: * by the Unicode standard, should be used in preference to 1517: * this. These are also returned from calls to <code>of(int)</code> 1518: * and <code>of(char)</code>. 1519: */ 1520: public static final UnicodeBlock SURROGATES_AREA 1521: = new UnicodeBlock(0xD800, 0xDFFF, 1522: "SURROGATES_AREA", 1523: "Surrogates Area"); 1524: 1525: /** 1526: * The defined subsets. The deprecated SURROGATES_AREA is not listed here. 1527: */ 1528: private static final UnicodeBlock sets[] = { 1529: BASIC_LATIN, 1530: LATIN_1_SUPPLEMENT, 1531: LATIN_EXTENDED_A, 1532: LATIN_EXTENDED_B, 1533: IPA_EXTENSIONS, 1534: SPACING_MODIFIER_LETTERS, 1535: COMBINING_DIACRITICAL_MARKS, 1536: GREEK, 1537: CYRILLIC, 1538: CYRILLIC_SUPPLEMENTARY, 1539: ARMENIAN, 1540: HEBREW, 1541: ARABIC, 1542: SYRIAC, 1543: THAANA, 1544: DEVANAGARI, 1545: BENGALI, 1546: GURMUKHI, 1547: GUJARATI, 1548: ORIYA, 1549: TAMIL, 1550: TELUGU, 1551: KANNADA, 1552: MALAYALAM, 1553: SINHALA, 1554: THAI, 1555: LAO, 1556: TIBETAN, 1557: MYANMAR, 1558: GEORGIAN, 1559: HANGUL_JAMO, 1560: ETHIOPIC, 1561: CHEROKEE, 1562: UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, 1563: OGHAM, 1564: RUNIC, 1565: TAGALOG, 1566: HANUNOO, 1567: BUHID, 1568: TAGBANWA, 1569: KHMER, 1570: MONGOLIAN, 1571: LIMBU, 1572: TAI_LE, 1573: KHMER_SYMBOLS, 1574: PHONETIC_EXTENSIONS, 1575: LATIN_EXTENDED_ADDITIONAL, 1576: GREEK_EXTENDED, 1577: GENERAL_PUNCTUATION, 1578: SUPERSCRIPTS_AND_SUBSCRIPTS, 1579: CURRENCY_SYMBOLS, 1580: COMBINING_MARKS_FOR_SYMBOLS, 1581:
LETTERLIKE_SYMBOLS, 1582: NUMBER_FORMS, 1583: ARROWS, 1584: MATHEMATICAL_OPERATORS, 1585: MISCELLANEOUS_TECHNICAL, 1586: CONTROL_PICTURES, 1587: OPTICAL_CHARACTER_RECOGNITION, 1588: ENCLOSED_ALPHANUMERICS, 1589: BOX_DRAWING, 1590: BLOCK_ELEMENTS, 1591: GEOMETRIC_SHAPES, 1592: MISCELLANEOUS_SYMBOLS, 1593: DINGBATS, 1594: MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A, 1595: SUPPLEMENTAL_ARROWS_A, 1596: BRAILLE_PATTERNS, 1597: SUPPLEMENTAL_ARROWS_B, 1598: MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B, 1599: SUPPLEMENTAL_MATHEMATICAL_OPERATORS, 1600: MISCELLANEOUS_SYMBOLS_AND_ARROWS, 1601: CJK_RADICALS_SUPPLEMENT, 1602: KANGXI_RADICALS, 1603: IDEOGRAPHIC_DESCRIPTION_CHARACTERS, 1604: CJK_SYMBOLS_AND_PUNCTUATION, 1605: HIRAGANA, 1606: KATAKANA, 1607: BOPOMOFO, 1608: HANGUL_COMPATIBILITY_JAMO, 1609: KANBUN, 1610: BOPOMOFO_EXTENDED, 1611: KATAKANA_PHONETIC_EXTENSIONS, 1612: ENCLOSED_CJK_LETTERS_AND_MONTHS, 1613: CJK_COMPATIBILITY, 1614: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, 1615: YIJING_HEXAGRAM_SYMBOLS, 1616: CJK_UNIFIED_IDEOGRAPHS, 1617: YI_SYLLABLES, 1618: YI_RADICALS, 1619: HANGUL_SYLLABLES, 1620: HIGH_SURROGATES, 1621: HIGH_PRIVATE_USE_SURROGATES, 1622: LOW_SURROGATES, 1623: PRIVATE_USE_AREA, 1624: CJK_COMPATIBILITY_IDEOGRAPHS, 1625: ALPHABETIC_PRESENTATION_FORMS, 1626: ARABIC_PRESENTATION_FORMS_A, 1627: VARIATION_SELECTORS, 1628: COMBINING_HALF_MARKS, 1629: CJK_COMPATIBILITY_FORMS, 1630: SMALL_FORM_VARIANTS, 1631: ARABIC_PRESENTATION_FORMS_B, 1632: HALFWIDTH_AND_FULLWIDTH_FORMS, 1633: SPECIALS, 1634: LINEAR_B_SYLLABARY, 1635: LINEAR_B_IDEOGRAMS, 1636: AEGEAN_NUMBERS, 1637: OLD_ITALIC, 1638: GOTHIC, 1639: UGARITIC, 1640: DESERET, 1641: SHAVIAN, 1642: OSMANYA, 1643: CYPRIOT_SYLLABARY, 1644: BYZANTINE_MUSICAL_SYMBOLS, 1645: MUSICAL_SYMBOLS, 1646: TAI_XUAN_JING_SYMBOLS, 1647: MATHEMATICAL_ALPHANUMERIC_SYMBOLS, 1648: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, 1649: CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, 1650: TAGS, 1651: VARIATION_SELECTORS_SUPPLEMENT, 1652: SUPPLEMENTARY_PRIVATE_USE_AREA_A, 
1653: SUPPLEMENTARY_PRIVATE_USE_AREA_B, 1654: }; 1655: } // class UnicodeBlock 1656: 1657: /** 1658: * The immutable value of this Character. 1659: * 1660: * @serial the value of this Character 1661: */ 1662: private final char value; 1663: 1664: /** 1665: * Compatible with JDK 1.0+. 1666: */ 1667: private static final long serialVersionUID = 3786198910865385080L; 1668: 1669: /** 1670: * Smallest value allowed for radix arguments in Java. This value is 2. 1671: * 1672: * @see #digit(char, int) 1673: * @see #forDigit(int, int) 1674: * @see Integer#toString(int, int) 1675: * @see Integer#valueOf(String) 1676: */ 1677: public static final int MIN_RADIX = 2; 1678: 1679: /** 1680: * Largest value allowed for radix arguments in Java. This value is 36. 1681: * 1682: * @see #digit(char, int) 1683: * @see #forDigit(int, int) 1684: * @see Integer#toString(int, int) 1685: * @see Integer#valueOf(String) 1686: */ 1687: public static final int MAX_RADIX = 36; 1688: 1689: /** 1690: * The minimum value the char data type can hold. 1691: * This value is <code>'\\u0000'</code>. 1692: */ 1693: public static final char MIN_VALUE = '\u0000'; 1694: 1695: /** 1696: * The maximum value the char data type can hold. 1697: * This value is <code>'\\uFFFF'</code>. 1698: */ 1699: public static final char MAX_VALUE = '\uFFFF'; 1700: 1701: /** 1702: * Class object representing the primitive char data type. 1703: * 1704: * @since 1.1 1705: */ 1706: public static final Class TYPE = VMClassLoader.getPrimitiveClass('C'); 1707: 1708: /** 1709: * The number of bits needed to represent a <code>char</code>. 1710: * @since 1.5 1711: */ 1712: public static final int SIZE = 16; 1713: 1714: // This caches some Character values, and is used by boxing 1715: // conversions via valueOf(). We must cache at least 0..127; 1716: // this constant controls how much we actually cache. 
1717: private static final int MAX_CACHE = 127; 1718: private static Character[] charCache = new Character[MAX_CACHE + 1]; 1719: 1720: /** 1721: * Lu = Letter, Uppercase (Informative). 1722: * 1723: * @since 1.1 1724: */ 1725: public static final byte UPPERCASE_LETTER = 1; 1726: 1727: /** 1728: * Ll = Letter, Lowercase (Informative). 1729: * 1730: * @since 1.1 1731: */ 1732: public static final byte LOWERCASE_LETTER = 2; 1733: 1734: /** 1735: * Lt = Letter, Titlecase (Informative). 1736: * 1737: * @since 1.1 1738: */ 1739: public static final byte TITLECASE_LETTER = 3; 1740: 1741: /** 1742: * Mn = Mark, Non-Spacing (Normative). 1743: * 1744: * @since 1.1 1745: */ 1746: public static final byte NON_SPACING_MARK = 6; 1747: 1748: /** 1749: * Mc = Mark, Spacing Combining (Normative). 1750: * 1751: * @since 1.1 1752: */ 1753: public static final byte COMBINING_SPACING_MARK = 8; 1754: 1755: /** 1756: * Me = Mark, Enclosing (Normative). 1757: * 1758: * @since 1.1 1759: */ 1760: public static final byte ENCLOSING_MARK = 7; 1761: 1762: /** 1763: * Nd = Number, Decimal Digit (Normative). 1764: * 1765: * @since 1.1 1766: */ 1767: public static final byte DECIMAL_DIGIT_NUMBER = 9; 1768: 1769: /** 1770: * Nl = Number, Letter (Normative). 1771: * 1772: * @since 1.1 1773: */ 1774: public static final byte LETTER_NUMBER = 10; 1775: 1776: /** 1777: * No = Number, Other (Normative). 1778: * 1779: * @since 1.1 1780: */ 1781: public static final byte OTHER_NUMBER = 11; 1782: 1783: /** 1784: * Zs = Separator, Space (Normative). 1785: * 1786: * @since 1.1 1787: */ 1788: public static final byte SPACE_SEPARATOR = 12; 1789: 1790: /** 1791: * Zl = Separator, Line (Normative). 1792: * 1793: * @since 1.1 1794: */ 1795: public static final byte LINE_SEPARATOR = 13; 1796: 1797: /** 1798: * Zp = Separator, Paragraph (Normative). 1799: * 1800: * @since 1.1 1801: */ 1802: public static final byte PARAGRAPH_SEPARATOR = 14; 1803: 1804: /** 1805: * Cc = Other, Control (Normative). 
1806: * 1807: * @since 1.1 1808: */ 1809: public static final byte CONTROL = 15; 1810: 1811: /** 1812: * Cf = Other, Format (Normative). 1813: * 1814: * @since 1.1 1815: */ 1816: public static final byte FORMAT = 16; 1817: 1818: /** 1819: * Cs = Other, Surrogate (Normative). 1820: * 1821: * @since 1.1 1822: */ 1823: public static final byte SURROGATE = 19; 1824: 1825: /** 1826: * Co = Other, Private Use (Normative). 1827: * 1828: * @since 1.1 1829: */ 1830: public static final byte PRIVATE_USE = 18; 1831: 1832: /** 1833: * Cn = Other, Not Assigned (Normative). 1834: * 1835: * @since 1.1 1836: */ 1837: public static final byte UNASSIGNED = 0; 1838: 1839: /** 1840: * Lm = Letter, Modifier (Informative). 1841: * 1842: * @since 1.1 1843: */ 1844: public static final byte MODIFIER_LETTER = 4; 1845: 1846: /** 1847: * Lo = Letter, Other (Informative). 1848: * 1849: * @since 1.1 1850: */ 1851: public static final byte OTHER_LETTER = 5; 1852: 1853: /** 1854: * Pc = Punctuation, Connector (Informative). 1855: * 1856: * @since 1.1 1857: */ 1858: public static final byte CONNECTOR_PUNCTUATION = 23; 1859: 1860: /** 1861: * Pd = Punctuation, Dash (Informative). 1862: * 1863: * @since 1.1 1864: */ 1865: public static final byte DASH_PUNCTUATION = 20; 1866: 1867: /** 1868: * Ps = Punctuation, Open (Informative). 1869: * 1870: * @since 1.1 1871: */ 1872: public static final byte START_PUNCTUATION = 21; 1873: 1874: /** 1875: * Pe = Punctuation, Close (Informative). 1876: * 1877: * @since 1.1 1878: */ 1879: public static final byte END_PUNCTUATION = 22; 1880: 1881: /** 1882: * Pi = Punctuation, Initial Quote (Informative). 1883: * 1884: * @since 1.4 1885: */ 1886: public static final byte INITIAL_QUOTE_PUNCTUATION = 29; 1887: 1888: /** 1889: * Pf = Punctuation, Final Quote (Informative). 1890: * 1891: * @since 1.4 1892: */ 1893: public static final byte FINAL_QUOTE_PUNCTUATION = 30; 1894: 1895: /** 1896: * Po = Punctuation, Other (Informative). 
1897: * 1898: * @since 1.1 1899: */ 1900: public static final byte OTHER_PUNCTUATION = 24; 1901: 1902: /** 1903: * Sm = Symbol, Math (Informative). 1904: * 1905: * @since 1.1 1906: */ 1907: public static final byte MATH_SYMBOL = 25; 1908: 1909: /** 1910: * Sc = Symbol, Currency (Informative). 1911: * 1912: * @since 1.1 1913: */ 1914: public static final byte CURRENCY_SYMBOL = 26; 1915: 1916: /** 1917: * Sk = Symbol, Modifier (Informative). 1918: * 1919: * @since 1.1 1920: */ 1921: public static final byte MODIFIER_SYMBOL = 27; 1922: 1923: /** 1924: * So = Symbol, Other (Informative). 1925: * 1926: * @since 1.1 1927: */ 1928: public static final byte OTHER_SYMBOL = 28; 1929: 1930: /** 1931: * Undefined bidirectional character type. Undefined char values have 1932: * undefined directionality in the Unicode specification. 1933: * 1934: * @since 1.4 1935: */ 1936: public static final byte DIRECTIONALITY_UNDEFINED = -1; 1937: 1938: /** 1939: * Strong bidirectional character type "L". 1940: * 1941: * @since 1.4 1942: */ 1943: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0; 1944: 1945: /** 1946: * Strong bidirectional character type "R". 1947: * 1948: * @since 1.4 1949: */ 1950: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1; 1951: 1952: /** 1953: * Strong bidirectional character type "AL". 1954: * 1955: * @since 1.4 1956: */ 1957: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2; 1958: 1959: /** 1960: * Weak bidirectional character type "EN". 1961: * 1962: * @since 1.4 1963: */ 1964: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3; 1965: 1966: /** 1967: * Weak bidirectional character type "ES". 1968: * 1969: * @since 1.4 1970: */ 1971: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4; 1972: 1973: /** 1974: * Weak bidirectional character type "ET". 
1975: * 1976: * @since 1.4 1977: */ 1978: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5; 1979: 1980: /** 1981: * Weak bidirectional character type "AN". 1982: * 1983: * @since 1.4 1984: */ 1985: public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6; 1986: 1987: /** 1988: * Weak bidirectional character type "CS". 1989: * 1990: * @since 1.4 1991: */ 1992: public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7; 1993: 1994: /** 1995: * Weak bidirectional character type "NSM". 1996: * 1997: * @since 1.4 1998: */ 1999: public static final byte DIRECTIONALITY_NONSPACING_MARK = 8; 2000: 2001: /** 2002: * Weak bidirectional character type "BN". 2003: * 2004: * @since 1.4 2005: */ 2006: public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9; 2007: 2008: /** 2009: * Neutral bidirectional character type "B". 2010: * 2011: * @since 1.4 2012: */ 2013: public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10; 2014: 2015: /** 2016: * Neutral bidirectional character type "S". 2017: * 2018: * @since 1.4 2019: */ 2020: public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11; 2021: 2022: /** 2023: * Neutral bidirectional character type "WS". 2024: * 2025: * @since 1.4 2026: */ 2027: public static final byte DIRECTIONALITY_WHITESPACE = 12; 2028: 2029: /** 2030: * Neutral bidirectional character type "ON". 2031: * 2032: * @since 1.4 2033: */ 2034: public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13; 2035: 2036: /** 2037: * Strong bidirectional character type "LRE". 2038: * 2039: * @since 1.4 2040: */ 2041: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14; 2042: 2043: /** 2044: * Strong bidirectional character type "LRO". 2045: * 2046: * @since 1.4 2047: */ 2048: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15; 2049: 2050: /** 2051: * Strong bidirectional character type "RLE".
2052: * 2053: * @since 1.4 2054: */ 2055: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16; 2056: 2057: /** 2058: * Strong bidirectional character type "RLO". 2059: * 2060: * @since 1.4 2061: */ 2062: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17; 2063: 2064: /** 2065: * Weak bidirectional character type "PDF". 2066: * 2067: * @since 1.4 2068: */ 2069: public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18; 2070: 2071: /** 2072: * Mask for grabbing the type out of the result of readChar. 2073: * @see #readChar(char) 2074: */ 2075: private static final int TYPE_MASK = 0x1F; 2076: 2077: /** 2078: * Mask for grabbing the non-breaking space flag out of the result of 2079: * readChar. 2080: * @see #readChar(char) 2081: */ 2082: private static final int NO_BREAK_MASK = 0x20; 2083: 2084: /** 2085: * Mask for grabbing the mirrored directionality flag out of the result 2086: * of readChar. 2087: * @see #readChar(char) 2088: */ 2089: private static final int MIRROR_MASK = 0x40; 2090: 2091: /** 2092: * Min value for supplementary code point. 2093: * 2094: * @since 1.5 2095: */ 2096: public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; 2097: 2098: /** 2099: * Min value for code point. 2100: * 2101: * @since 1.5 2102: */ 2103: public static final int MIN_CODE_POINT = 0; 2104: 2105: 2106: /** 2107: * Max value for code point. 2108: * 2109: * @since 1.5 2110: */ 2111: public static final int MAX_CODE_POINT = 0x010ffff; 2112: 2113: 2114: /** 2115: * Minimum high surrogate code in UTF-16 encoding. 2116: * 2117: * @since 1.5 2118: */ 2119: public static final char MIN_HIGH_SURROGATE = '\ud800'; 2120: 2121: /** 2122: * Maximum high surrogate code in UTF-16 encoding. 2123: * 2124: * @since 1.5 2125: */ 2126: public static final char MAX_HIGH_SURROGATE = '\udbff'; 2127: 2128: /** 2129: * Minimum low surrogate code in UTF-16 encoding. 
2130: * 2131: * @since 1.5 2132: */ 2133: public static final char MIN_LOW_SURROGATE = '\udc00'; 2134: 2135: /** 2136: * Maximum low surrogate code in UTF-16 encoding. 2137: * 2138: * @since 1.5 2139: */ 2140: public static final char MAX_LOW_SURROGATE = '\udfff'; 2141: 2142: /** 2143: * Minimum surrogate code in UTF-16 encoding. 2144: * 2145: * @since 1.5 2146: */ 2147: public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE; 2148: 2149: /** 2150: * Maximum surrogate code in UTF-16 encoding. 2151: * 2152: * @since 1.5 2153: */ 2154: public static final char MAX_SURROGATE = MAX_LOW_SURROGATE; 2155: 2156: /** 2157: * Grabs an attribute offset from the Unicode attribute database. The lower 2158: * 5 bits are the character type, the next 2 bits are flags, and the top 2159: * 9 bits are the offset into the attribute tables. Note that the top 9 2160: * bits are meaningless in this context; they are useful only in the native 2161: * code. 2162: * 2163: * @param ch the character to look up 2164: * @return the character's attribute offset and type 2165: * @see #TYPE_MASK 2166: * @see #NO_BREAK_MASK 2167: * @see #MIRROR_MASK 2168: */ 2169: private static native char readChar(char ch); 2170: 2171: /** 2172: * Grabs an attribute offset from the Unicode attribute database. The lower 2173: * 5 bits are the character type, the next 2 bits are flags, and the top 2174: * 9 bits are the offset into the attribute tables. Note that the top 9 2175: * bits are meaningless in this context; they are useful only in the native 2176: * code. 2177: * 2178: * @param codePoint the code point to look up 2179: * @return the code point's attribute offset and type 2180: * @see #TYPE_MASK 2181: * @see #NO_BREAK_MASK 2182: * @see #MIRROR_MASK 2183: */ 2184: private static native char readCodePoint(int codePoint); 2185: 2186: /** 2187: * Wraps up a character.
2188: * 2189: * @param value the character to wrap 2190: */ 2191: public Character(char value) 2192: { 2193: this.value = value; 2194: } 2195: 2196: /** 2197: * Returns the character which has been wrapped by this class. 2198: * 2199: * @return the character wrapped 2200: */ 2201: public char charValue() 2202: { 2203: return value; 2204: } 2205: 2206: /** 2207: * Returns the numerical value (unsigned) of the wrapped character. 2208: * Range of returned values: 0x0000-0xFFFF. 2209: * 2210: * @return the value of the wrapped character 2211: */ 2212: public int hashCode() 2213: { 2214: return value; 2215: } 2216: 2217: /** 2218: * Determines if an object is equal to this object. This is only true for 2219: * another Character object wrapping the same value. 2220: * 2221: * @param o object to compare 2222: * @return true if o is a Character with the same value 2223: */ 2224: public boolean equals(Object o) 2225: { 2226: return o instanceof Character && value == ((Character) o).value; 2227: } 2228: 2229: /** 2230: * Converts the wrapped character into a String. 2231: * 2232: * @return a String containing one character -- the wrapped character 2233: * of this instance 2234: */ 2235: public String toString() 2236: { 2237: // This assumes that String.valueOf(char) can create a single-character 2238: // String more efficiently than through the public API. 2239: return String.valueOf(value); 2240: } 2241: 2242: /** 2243: * Returns a String of length 1 representing the specified character. 2244: * 2245: * @param ch the character to convert 2246: * @return a String containing the character 2247: * @since 1.4 2248: */ 2249: public static String toString(char ch) 2250: { 2251: // This assumes that String.valueOf(char) can create a single-character 2252: // String more efficiently than through the public API. 2253: return String.valueOf(ch); 2254: } 2255: 2256: /** 2257: * Determines if a character is a Unicode lowercase letter. For example, 2258: * <code>'a'</code> is lowercase. 
2259: * <br> 2260: * lowercase = [Ll] 2261: * 2262: * @param ch character to test 2263: * @return true if ch is a Unicode lowercase letter, else false 2264: * @see #isUpperCase(char) 2265: * @see #isTitleCase(char) 2266: * @see #toLowerCase(char) 2267: * @see #getType(char) 2268: */ 2269: public static boolean isLowerCase(char ch) 2270: { 2271: return getType(ch) == LOWERCASE_LETTER; 2272: } 2273: 2274: /** 2275: * Determines if a character is a Unicode lowercase letter. For example, 2276: * <code>'a'</code> is lowercase. Unlike isLowerCase(char), this method 2277: * supports supplementary Unicode code points. 2278: * <br> 2279: * lowercase = [Ll] 2280: * 2281: * @param codePoint character to test 2282: * @return true if codePoint is a Unicode lowercase letter, else false 2283: * @see #isUpperCase(int) 2284: * @see #isTitleCase(int) 2285: * @see #toLowerCase(int) 2286: * @see #getType(int) 2287: * @since 1.5 2288: */ 2289: public static boolean isLowerCase(int codePoint) 2290: { 2291: return getType(codePoint) == LOWERCASE_LETTER; 2292: } 2293: 2294: /** 2295: * Determines if a character is a Unicode uppercase letter. For example, 2296: * <code>'A'</code> is uppercase. 2297: * <br> 2298: * uppercase = [Lu] 2299: * 2300: * @param ch character to test 2301: * @return true if ch is a Unicode uppercase letter, else false 2302: * @see #isLowerCase(char) 2303: * @see #isTitleCase(char) 2304: * @see #toUpperCase(char) 2305: * @see #getType(char) 2306: */ 2307: public static boolean isUpperCase(char ch) 2308: { 2309: return getType(ch) == UPPERCASE_LETTER; 2310: } 2311: 2312: /** 2313: * Determines if a character is a Unicode uppercase letter. For example, 2314: * <code>'A'</code> is uppercase. Unlike isUpperCase(char), this method 2315: * supports supplementary Unicode code points. 
2316: * <br> 2317: * uppercase = [Lu] 2318: * 2319: * @param codePoint character to test 2320: * @return true if codePoint is a Unicode uppercase letter, else false 2321: * @see #isLowerCase(int) 2322: * @see #isTitleCase(int) 2323: * @see #toUpperCase(int) 2324: * @see #getType(int) 2325: * @since 1.5 2326: */ 2327: public static boolean isUpperCase(int codePoint) 2328: { 2329: return getType(codePoint) == UPPERCASE_LETTER; 2330: } 2331: 2332: /** 2333: * Determines if a character is a Unicode titlecase letter. For example, 2334: * the character "Lj" (Latin capital L with small letter j) is titlecase. 2335: * <br> 2336: * titlecase = [Lt] 2337: * 2338: * @param ch character to test 2339: * @return true if ch is a Unicode titlecase letter, else false 2340: * @see #isLowerCase(char) 2341: * @see #isUpperCase(char) 2342: * @see #toTitleCase(char) 2343: * @see #getType(char) 2344: */ 2345: public static boolean isTitleCase(char ch) 2346: { 2347: return getType(ch) == TITLECASE_LETTER; 2348: } 2349: 2350: /** 2351: * Determines if a character is a Unicode titlecase letter. For example, 2352: * the character "Lj" (Latin capital L with small letter j) is titlecase. 2353: * Unlike isTitleCase(char), this method supports supplementary Unicode 2354: * code points. 2355: * <br> 2356: * titlecase = [Lt] 2357: * 2358: * @param codePoint character to test 2359: * @return true if codePoint is a Unicode titlecase letter, else false 2360: * @see #isLowerCase(int) 2361: * @see #isUpperCase(int) 2362: * @see #toTitleCase(int) 2363: * @see #getType(int) 2364: * @since 1.5 2365: */ 2366: public static boolean isTitleCase(int codePoint) 2367: { 2368: return getType(codePoint) == TITLECASE_LETTER; 2369: } 2370: 2371: /** 2372: * Determines if a character is a Unicode decimal digit. For example, 2373: * <code>'0'</code> is a digit. 
2374: * <br> 2375: * Unicode decimal digit = [Nd] 2376: * 2377: * @param ch character to test 2378: * @return true if ch is a Unicode decimal digit, else false 2379: * @see #digit(char, int) 2380: * @see #forDigit(int, int) 2381: * @see #getType(char) 2382: */ 2383: public static boolean isDigit(char ch) 2384: { 2385: return getType(ch) == DECIMAL_DIGIT_NUMBER; 2386: } 2387: 2388: /** 2389: * Determines if a character is a Unicode decimal digit. For example, 2390: * <code>'0'</code> is a digit. Unlike isDigit(char), this method 2391: * supports supplementary Unicode code points. 2392: * <br> 2393: * Unicode decimal digit = [Nd] 2394: * 2395: * @param codePoint character to test 2396: * @return true if codePoint is a Unicode decimal digit, else false 2397: * @see #digit(int, int) 2398: * @see #forDigit(int, int) 2399: * @see #getType(int) 2400: * @since 1.5 2401: */ 2402: public static boolean isDigit(int codePoint) 2403: { 2404: return getType(codePoint) == DECIMAL_DIGIT_NUMBER; 2405: } 2406: 2407: /** 2408: * Determines if a character is part of the Unicode Standard. This is an 2409: * evolving standard, but covers every character in the data file. 2410: * <br> 2411: * defined = not [Cn] 2412: * 2413: * @param ch character to test 2414: * @return true if ch is a Unicode character, else false 2415: * @see #isDigit(char) 2416: * @see #isLetter(char) 2417: * @see #isLetterOrDigit(char) 2418: * @see #isLowerCase(char) 2419: * @see #isTitleCase(char) 2420: * @see #isUpperCase(char) 2421: */ 2422: public static boolean isDefined(char ch) 2423: { 2424: return getType(ch) != UNASSIGNED; 2425: } 2426: 2427: /** 2428: * Determines if a character is part of the Unicode Standard. This is an 2429: * evolving standard, but covers every character in the data file. Unlike 2430: * isDefined(char), this method supports supplementary Unicode code points.
2431: * <br> 2432: * defined = not [Cn] 2433: * 2434: * @param codePoint character to test 2435: * @return true if codePoint is a Unicode character, else false 2436: * @see #isDigit(int) 2437: * @see #isLetter(int) 2438: * @see #isLetterOrDigit(int) 2439: * @see #isLowerCase(int) 2440: * @see #isTitleCase(int) 2441: * @see #isUpperCase(int) 2442: * @since 1.5 2443: */ 2444: public static boolean isDefined(int codePoint) 2445: { 2446: return getType(codePoint) != UNASSIGNED; 2447: } 2448: 2449: /** 2450: * Determines if a character is a Unicode letter. Not all letters have case, 2451: * so this may return true when isLowerCase and isUpperCase return false. 2452: * <br> 2453: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2454: * 2455: * @param ch character to test 2456: * @return true if ch is a Unicode letter, else false 2457: * @see #isDigit(char) 2458: * @see #isJavaIdentifierStart(char) 2459: * @see #isJavaLetter(char) 2460: * @see #isJavaLetterOrDigit(char) 2461: * @see #isLetterOrDigit(char) 2462: * @see #isLowerCase(char) 2463: * @see #isTitleCase(char) 2464: * @see #isUnicodeIdentifierStart(char) 2465: * @see #isUpperCase(char) 2466: */ 2467: public static boolean isLetter(char ch) 2468: { 2469: return ((1 << getType(ch)) 2470: & ((1 << UPPERCASE_LETTER) 2471: | (1 << LOWERCASE_LETTER) 2472: | (1 << TITLECASE_LETTER) 2473: | (1 << MODIFIER_LETTER) 2474: | (1 << OTHER_LETTER))) != 0; 2475: } 2476: 2477: /** 2478: * Determines if a character is a Unicode letter. Not all letters have case, 2479: * so this may return true when isLowerCase and isUpperCase return false. 2480: * Unlike isLetter(char), this method supports supplementary Unicode code 2481: * points. 
2482: * <br> 2483: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2484: * 2485: * @param codePoint character to test 2486: * @return true if codePoint is a Unicode letter, else false 2487: * @see #isDigit(int) 2488: * @see #isJavaIdentifierStart(int) 2489: * @see #isJavaLetter(int) 2490: * @see #isJavaLetterOrDigit(int) 2491: * @see #isLetterOrDigit(int) 2492: * @see #isLowerCase(int) 2493: * @see #isTitleCase(int) 2494: * @see #isUnicodeIdentifierStart(int) 2495: * @see #isUpperCase(int) 2496: * @since 1.5 2497: */ 2498: public static boolean isLetter(int codePoint) 2499: { 2500: return ((1 << getType(codePoint)) 2501: & ((1 << UPPERCASE_LETTER) 2502: | (1 << LOWERCASE_LETTER) 2503: | (1 << TITLECASE_LETTER) 2504: | (1 << MODIFIER_LETTER) 2505: | (1 << OTHER_LETTER))) != 0; 2506: } 2507: 2508: /** 2509: * Determines if a character is a Unicode letter or a Unicode digit. This 2510: * is the combination of isLetter and isDigit. 2511: * <br> 2512: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 2513: * 2514: * @param ch character to test 2515: * @return true if ch is a Unicode letter or a Unicode digit, else false 2516: * @see #isDigit(char) 2517: * @see #isJavaIdentifierPart(char) 2518: * @see #isJavaLetter(char) 2519: * @see #isJavaLetterOrDigit(char) 2520: * @see #isLetter(char) 2521: * @see #isUnicodeIdentifierPart(char) 2522: */ 2523: public static boolean isLetterOrDigit(char ch) 2524: { 2525: return ((1 << getType(ch)) 2526: & ((1 << UPPERCASE_LETTER) 2527: | (1 << LOWERCASE_LETTER) 2528: | (1 << TITLECASE_LETTER) 2529: | (1 << MODIFIER_LETTER) 2530: | (1 << OTHER_LETTER) 2531: | (1 << DECIMAL_DIGIT_NUMBER))) != 0; 2532: } 2533: 2534: /** 2535: * Determines if a character is a Unicode letter or a Unicode digit. This 2536: * is the combination of isLetter and isDigit. Unlike isLetterOrDigit(char), 2537: * this method supports supplementary Unicode code points. 
2538: * <br> 2539: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 2540: * 2541: * @param codePoint character to test 2542: * @return true if codePoint is a Unicode letter or a Unicode digit, else false 2543: * @see #isDigit(int) 2544: * @see #isJavaIdentifierPart(int) 2545: * @see #isJavaLetter(int) 2546: * @see #isJavaLetterOrDigit(int) 2547: * @see #isLetter(int) 2548: * @see #isUnicodeIdentifierPart(int) 2549: * @since 1.5 2550: */ 2551: public static boolean isLetterOrDigit(int codePoint) 2552: { 2553: return ((1 << getType(codePoint) 2554: & ((1 << UPPERCASE_LETTER) 2555: | (1 << LOWERCASE_LETTER) 2556: | (1 << TITLECASE_LETTER) 2557: | (1 << MODIFIER_LETTER) 2558: | (1 << OTHER_LETTER) 2559: | (1 << DECIMAL_DIGIT_NUMBER))) != 0); 2560: } 2561: 2562: /** 2563: * Determines if a character can start a Java identifier. This is the 2564: * combination of isLetter, any character where getType returns 2565: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 2566: * (like '_'). 2567: * 2568: * @param ch character to test 2569: * @return true if ch can start a Java identifier, else false 2570: * @deprecated Replaced by {@link #isJavaIdentifierStart(char)} 2571: * @see #isJavaLetterOrDigit(char) 2572: * @see #isJavaIdentifierStart(char) 2573: * @see #isJavaIdentifierPart(char) 2574: * @see #isLetter(char) 2575: * @see #isLetterOrDigit(char) 2576: * @see #isUnicodeIdentifierStart(char) 2577: */ 2578: public static boolean isJavaLetter(char ch) 2579: { 2580: return isJavaIdentifierStart(ch); 2581: } 2582: 2583: /** 2584: * Determines if a character can start a Java identifier. This is the 2585: * combination of isLetter, any character where getType returns 2586: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 2587: * (like '_'). Unlike isJavaIdentifierStart(char), this method supports 2588: * supplementary Unicode code points. 
2589: * <br> 2590: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 2591: * 2592: * @param codePoint character to test 2593: * @return true if codePoint can start a Java identifier, else false 2594: * @see #isJavaIdentifierPart(int) 2595: * @see #isLetter(int) 2596: * @see #isUnicodeIdentifierStart(int) 2597: * @since 1.5 2598: */ 2599: public static boolean isJavaIdentifierStart(int codePoint) 2600: { 2601: return ((1 << getType(codePoint)) 2602: & ((1 << UPPERCASE_LETTER) 2603: | (1 << LOWERCASE_LETTER) 2604: | (1 << TITLECASE_LETTER) 2605: | (1 << MODIFIER_LETTER) 2606: | (1 << OTHER_LETTER) 2607: | (1 << LETTER_NUMBER) 2608: | (1 << CURRENCY_SYMBOL) 2609: | (1 << CONNECTOR_PUNCTUATION))) != 0; 2610: } 2611: 2612: /** 2613: * Determines if a character can follow the first letter in 2614: * a Java identifier. This is the combination of isJavaLetter (isLetter, 2615: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 2616: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 2617: * or isIdentifierIgnorable. 2618: * 2619: * @param ch character to test 2620: * @return true if ch can follow the first letter in a Java identifier 2621: * @deprecated Replaced by {@link #isJavaIdentifierPart(char)} 2622: * @see #isJavaLetter(char) 2623: * @see #isJavaIdentifierStart(char) 2624: * @see #isJavaIdentifierPart(char) 2625: * @see #isLetter(char) 2626: * @see #isLetterOrDigit(char) 2627: * @see #isUnicodeIdentifierPart(char) 2628: * @see #isIdentifierIgnorable(char) 2629: */ 2630: public static boolean isJavaLetterOrDigit(char ch) 2631: { 2632: return isJavaIdentifierPart(ch); 2633: } 2634: 2635: /** 2636: * Determines if a character can start a Java identifier. This is the 2637: * combination of isLetter, any character where getType returns 2638: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 2639: * (like '_'). 
2640: * <br> 2641: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 2642: * 2643: * @param ch character to test 2644: * @return true if ch can start a Java identifier, else false 2645: * @see #isJavaIdentifierPart(char) 2646: * @see #isLetter(char) 2647: * @see #isUnicodeIdentifierStart(char) 2648: * @since 1.1 2649: */ 2650: public static boolean isJavaIdentifierStart(char ch) 2651: { 2652: return ((1 << getType(ch)) 2653: & ((1 << UPPERCASE_LETTER) 2654: | (1 << LOWERCASE_LETTER) 2655: | (1 << TITLECASE_LETTER) 2656: | (1 << MODIFIER_LETTER) 2657: | (1 << OTHER_LETTER) 2658: | (1 << LETTER_NUMBER) 2659: | (1 << CURRENCY_SYMBOL) 2660: | (1 << CONNECTOR_PUNCTUATION))) != 0; 2661: } 2662: 2663: /** 2664: * Determines if a character can follow the first letter in 2665: * a Java identifier. This is the combination of isJavaLetter (isLetter, 2666: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 2667: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 2668: * or isIdentifierIgnorable. 
2669: * <br> 2670: * Java identifier extender = 2671: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 2672: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 2673: * 2674: * @param ch character to test 2675: * @return true if ch can follow the first letter in a Java identifier 2676: * @see #isIdentifierIgnorable(char) 2677: * @see #isJavaIdentifierStart(char) 2678: * @see #isLetterOrDigit(char) 2679: * @see #isUnicodeIdentifierPart(char) 2680: * @since 1.1 2681: */ 2682: public static boolean isJavaIdentifierPart(char ch) 2683: { 2684: int category = getType(ch); 2685: return ((1 << category) 2686: & ((1 << UPPERCASE_LETTER) 2687: | (1 << LOWERCASE_LETTER) 2688: | (1 << TITLECASE_LETTER) 2689: | (1 << MODIFIER_LETTER) 2690: | (1 << OTHER_LETTER) 2691: | (1 << NON_SPACING_MARK) 2692: | (1 << COMBINING_SPACING_MARK) 2693: | (1 << DECIMAL_DIGIT_NUMBER) 2694: | (1 << LETTER_NUMBER) 2695: | (1 << CURRENCY_SYMBOL) 2696: | (1 << CONNECTOR_PUNCTUATION) 2697: | (1 << FORMAT))) != 0 2698: || (category == CONTROL && isIdentifierIgnorable(ch)); 2699: } 2700: 2701: /** 2702: * Determines if a character can follow the first letter in 2703: * a Java identifier. This is the combination of isJavaLetter (isLetter, 2704: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 2705: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 2706: * or isIdentifierIgnorable. Unlike isJavaIdentifierPart(char), this method 2707: * supports supplementary Unicode code points. 
2708: * <br> 2709: * Java identifier extender = 2710: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 2711: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 2712: * 2713: * @param codePoint character to test 2714: * @return true if codePoint can follow the first letter in a Java identifier 2715: * @see #isIdentifierIgnorable(int) 2716: * @see #isJavaIdentifierStart(int) 2717: * @see #isLetterOrDigit(int) 2718: * @see #isUnicodeIdentifierPart(int) 2719: * @since 1.5 2720: */ 2721: public static boolean isJavaIdentifierPart(int codePoint) 2722: { 2723: int category = getType(codePoint); 2724: return ((1 << category) 2725: & ((1 << UPPERCASE_LETTER) 2726: | (1 << LOWERCASE_LETTER) 2727: | (1 << TITLECASE_LETTER) 2728: | (1 << MODIFIER_LETTER) 2729: | (1 << OTHER_LETTER) 2730: | (1 << NON_SPACING_MARK) 2731: | (1 << COMBINING_SPACING_MARK) 2732: | (1 << DECIMAL_DIGIT_NUMBER) 2733: | (1 << LETTER_NUMBER) 2734: | (1 << CURRENCY_SYMBOL) 2735: | (1 << CONNECTOR_PUNCTUATION) 2736: | (1 << FORMAT))) != 0 2737: || (category == CONTROL && isIdentifierIgnorable(codePoint)); 2738: } 2739: 2740: /** 2741: * Determines if a character can start a Unicode identifier. Only 2742: * letters can start a Unicode identifier, but this includes characters 2743: * in LETTER_NUMBER. 
2744: * <br> 2745: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 2746: * 2747: * @param ch character to test 2748: * @return true if ch can start a Unicode identifier, else false 2749: * @see #isJavaIdentifierStart(char) 2750: * @see #isLetter(char) 2751: * @see #isUnicodeIdentifierPart(char) 2752: * @since 1.1 2753: */ 2754: public static boolean isUnicodeIdentifierStart(char ch) 2755: { 2756: return ((1 << getType(ch)) 2757: & ((1 << UPPERCASE_LETTER) 2758: | (1 << LOWERCASE_LETTER) 2759: | (1 << TITLECASE_LETTER) 2760: | (1 << MODIFIER_LETTER) 2761: | (1 << OTHER_LETTER) 2762: | (1 << LETTER_NUMBER))) != 0; 2763: } 2764: 2765: /** 2766: * Determines if a character can start a Unicode identifier. Only 2767: * letters can start a Unicode identifier, but this includes characters 2768: * in LETTER_NUMBER. Unlike isUnicodeIdentifierStart(char), this method 2769: * supports supplementary Unicode code points. 2770: * <br> 2771: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 2772: * 2773: * @param codePoint character to test 2774: * @return true if codePoint can start a Unicode identifier, else false 2775: * @see #isJavaIdentifierStart(int) 2776: * @see #isLetter(int) 2777: * @see #isUnicodeIdentifierPart(int) 2778: * @since 1.5 2779: */ 2780: public static boolean isUnicodeIdentifierStart(int codePoint) 2781: { 2782: return ((1 << getType(codePoint)) 2783: & ((1 << UPPERCASE_LETTER) 2784: | (1 << LOWERCASE_LETTER) 2785: | (1 << TITLECASE_LETTER) 2786: | (1 << MODIFIER_LETTER) 2787: | (1 << OTHER_LETTER) 2788: | (1 << LETTER_NUMBER))) != 0; 2789: } 2790: 2791: /** 2792: * Determines if a character can follow the first letter in 2793: * a Unicode identifier. This includes letters, connecting punctuation, 2794: * digits, numeric letters, combining marks, non-spacing marks, and 2795: * isIdentifierIgnorable. 
2796: * <br> 2797: * Unicode identifier extender = 2798: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 2799: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 2800: * 2801: * @param ch character to test 2802: * @return true if ch can follow the first letter in a Unicode identifier 2803: * @see #isIdentifierIgnorable(char) 2804: * @see #isJavaIdentifierPart(char) 2805: * @see #isLetterOrDigit(char) 2806: * @see #isUnicodeIdentifierStart(char) 2807: * @since 1.1 2808: */ 2809: public static boolean isUnicodeIdentifierPart(char ch) 2810: { 2811: int category = getType(ch); 2812: return ((1 << category) 2813: & ((1 << UPPERCASE_LETTER) 2814: | (1 << LOWERCASE_LETTER) 2815: | (1 << TITLECASE_LETTER) 2816: | (1 << MODIFIER_LETTER) 2817: | (1 << OTHER_LETTER) 2818: | (1 << NON_SPACING_MARK) 2819: | (1 << COMBINING_SPACING_MARK) 2820: | (1 << DECIMAL_DIGIT_NUMBER) 2821: | (1 << LETTER_NUMBER) 2822: | (1 << CONNECTOR_PUNCTUATION) 2823: | (1 << FORMAT))) != 0 2824: || (category == CONTROL && isIdentifierIgnorable(ch)); 2825: } 2826: 2827: /** 2828: * Determines if a character can follow the first letter in 2829: * a Unicode identifier. This includes letters, connecting punctuation, 2830: * digits, numeric letters, combining marks, non-spacing marks, and 2831: * isIdentifierIgnorable. Unlike isUnicodeIdentifierPart(char), this method 2832: * supports supplementary Unicode code points. 
2833: * <br> 2834: * Unicode identifier extender = 2835: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 2836: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 2837: * 2838: * @param codePoint character to test 2839: * @return true if codePoint can follow the first letter in a Unicode 2840: * identifier 2841: * @see #isIdentifierIgnorable(int) 2842: * @see #isJavaIdentifierPart(int) 2843: * @see #isLetterOrDigit(int) 2844: * @see #isUnicodeIdentifierStart(int) 2845: * @since 1.5 2846: */ 2847: public static boolean isUnicodeIdentifierPart(int codePoint) 2848: { 2849: int category = getType(codePoint); 2850: return ((1 << category) 2851: & ((1 << UPPERCASE_LETTER) 2852: | (1 << LOWERCASE_LETTER) 2853: | (1 << TITLECASE_LETTER) 2854: | (1 << MODIFIER_LETTER) 2855: | (1 << OTHER_LETTER) 2856: | (1 << NON_SPACING_MARK) 2857: | (1 << COMBINING_SPACING_MARK) 2858: | (1 << DECIMAL_DIGIT_NUMBER) 2859: | (1 << LETTER_NUMBER) 2860: | (1 << CONNECTOR_PUNCTUATION) 2861: | (1 << FORMAT))) != 0 2862: || (category == CONTROL && isIdentifierIgnorable(codePoint)); 2863: } 2864: 2865: /** 2866: * Determines if a character is ignorable in a Unicode identifier. This 2867: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 2868: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 2869: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 2870: * <code>'\u009F'</code>), and FORMAT characters. 
2871: * <br> 2872: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 2873: * |U+007F-U+009F 2874: * 2875: * @param ch character to test 2876: * @return true if ch is ignorable in a Unicode or Java identifier 2877: * @see #isJavaIdentifierPart(char) 2878: * @see #isUnicodeIdentifierPart(char) 2879: * @since 1.1 2880: */ 2881: public static boolean isIdentifierIgnorable(char ch) 2882: { 2883: return (ch <= '\u009F' && (ch < '\t' || ch >= '\u007F' 2884: || (ch <= '\u001B' && ch >= '\u000E'))) 2885: || getType(ch) == FORMAT; 2886: } 2887: 2888: /** 2889: * Determines if a character is ignorable in a Unicode identifier. This 2890: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 2891: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 2892: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 2893: * <code>'\u009F'</code>), and FORMAT characters. Unlike 2894: * isIdentifierIgnorable(char), this method supports supplementary Unicode 2895: * code points. 2896: * <br> 2897: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 2898: * |U+007F-U+009F 2899: * 2900: * @param codePoint character to test 2901: * @return true if codePoint is ignorable in a Unicode or Java identifier 2902: * @see #isJavaIdentifierPart(int) 2903: * @see #isUnicodeIdentifierPart(int) 2904: * @since 1.5 2905: */ 2906: public static boolean isIdentifierIgnorable(int codePoint) 2907: { 2908: return ((codePoint >= 0 && codePoint <= 0x0008) 2909: || (codePoint >= 0x000E && codePoint <= 0x001B) 2910: || (codePoint >= 0x007F && codePoint <= 0x009F) 2911: || getType(codePoint) == FORMAT); 2912: } 2913: 2914: /** 2915: * Converts a Unicode character into its lowercase equivalent mapping. 2916: * If a mapping does not exist, then the character passed is returned. 2917: * Note that isLowerCase(toLowerCase(ch)) does not always return true. 
*
   * @param ch character to convert to lowercase
   * @return lowercase mapping of ch, or ch if lowercase mapping does
   *         not exist
   * @see #isLowerCase(char)
   * @see #isUpperCase(char)
   * @see #toTitleCase(char)
   * @see #toUpperCase(char)
   */
  public static native char toLowerCase(char ch);

  /**
   * Converts a Unicode character into its lowercase equivalent mapping.
   * If a mapping does not exist, then the character passed is returned.
   * Note that isLowerCase(toLowerCase(codePoint)) does not always return true.
   * Unlike toLowerCase(char), this method supports supplementary Unicode
   * code points. The method is declared <code>native</code>, so its
   * implementation is not part of this source file.
   *
   * @param codePoint character to convert to lowercase
   * @return lowercase mapping of codePoint, or codePoint if lowercase
   *         mapping does not exist
   * @see #isLowerCase(int)
   * @see #isUpperCase(int)
   * @see #toTitleCase(int)
   * @see #toUpperCase(int)
   * @since 1.5
   */
  public static native int toLowerCase(int codePoint);

  /**
   * Converts a Unicode character into its uppercase equivalent mapping.
   * If a mapping does not exist, then the character passed is returned.
   * Note that isUpperCase(toUpperCase(ch)) does not always return true.
   * The method is declared <code>native</code>, so its implementation is
   * not part of this source file.
   *
   * @param ch character to convert to uppercase
   * @return uppercase mapping of ch, or ch if uppercase mapping does
   *         not exist
   * @see #isLowerCase(char)
   * @see #isUpperCase(char)
   * @see #toLowerCase(char)
   * @see #toTitleCase(char)
   */
  public static native char toUpperCase(char ch);

  /**
   * Converts a Unicode character into its uppercase equivalent mapping.
   * If a mapping does not exist, then the character passed is returned.
   * Note that isUpperCase(toUpperCase(codePoint)) does not always return true.
* Unlike toUpperCase(char), this method supports supplementary
   * Unicode code points.
   *
   * @param codePoint character to convert to uppercase
   * @return uppercase mapping of codePoint, or codePoint if uppercase
   *         mapping does not exist
   * @see #isLowerCase(int)
   * @see #isUpperCase(int)
   * @see #toLowerCase(int)
   * @see #toTitleCase(int)
   * @since 1.5
   */
  public static native int toUpperCase(int codePoint);

  /**
   * Converts a Unicode character into its titlecase equivalent mapping.
   * If a mapping does not exist, then the character passed is returned.
   * Note that isTitleCase(toTitleCase(ch)) does not always return true.
   * The method is declared <code>native</code>, so its implementation is
   * not part of this source file.
   *
   * @param ch character to convert to titlecase
   * @return titlecase mapping of ch, or ch if titlecase mapping does
   *         not exist
   * @see #isTitleCase(char)
   * @see #toLowerCase(char)
   * @see #toUpperCase(char)
   */
  public static native char toTitleCase(char ch);

  /**
   * Converts a Unicode character into its titlecase equivalent mapping.
   * If a mapping does not exist, then the character passed is returned.
   * Note that isTitleCase(toTitleCase(codePoint)) does not always return true.
   * Unlike toTitleCase(char), this method supports supplementary
   * Unicode code points. The method is declared <code>native</code>, so its
   * implementation is not part of this source file.
   *
   * @param codePoint character to convert to titlecase
   * @return titlecase mapping of codePoint, or codePoint if titlecase
   *         mapping does not exist
   * @see #isTitleCase(int)
   * @see #toLowerCase(int)
   * @see #toUpperCase(int)
   * @since 1.5
   */
  public static native int toTitleCase(int codePoint);

  /**
   * Converts a character into a digit of the specified radix.
If the radix
   * is outside the range MIN_RADIX to MAX_RADIX, or if the result of
   * getNumericValue(ch) exceeds the radix, or if ch is not a decimal digit
   * or in the case insensitive set of 'a'-'z', the result is -1.
   * <br>
   * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
   *   |U+FF21-U+FF3A|U+FF41-U+FF5A
   *
   * @param ch character to convert into a digit
   * @param radix radix in which ch is a digit
   * @return digit which ch represents in radix, or -1 if it is not a valid
   *         digit
   * @see #MIN_RADIX
   * @see #MAX_RADIX
   * @see #forDigit(int, int)
   * @see #isDigit(char)
   * @see #getNumericValue(char)
   */
  public static native int digit(char ch, int radix);

  /**
   * Converts a character into a digit of the specified radix. If the radix
   * is outside the range MIN_RADIX to MAX_RADIX, or if the result of
   * getNumericValue(int) exceeds the radix, or if codePoint is not a decimal
   * digit or in the case insensitive set of 'a'-'z', the result is -1.
   * Unlike digit(char, int), this method supports supplementary Unicode code
   * points.
   * <br>
   * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A
   *   |U+FF21-U+FF3A|U+FF41-U+FF5A
   *
   * @param codePoint character to convert into a digit
   * @param radix radix in which codePoint is a digit
   * @return digit which codePoint represents in radix, or -1 if it is not a
   *         valid digit
   * @see #MIN_RADIX
   * @see #MAX_RADIX
   * @see #forDigit(int, int)
   * @see #isDigit(int)
   * @see #getNumericValue(int)
   * @since 1.5
   */
  public static native int digit(int codePoint, int radix);

  /**
   * Returns the Unicode numeric value property of a character. For example,
   * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
*
   * <p>This method also returns values for the letters A through Z, (not
   * specified by Unicode), in these ranges: <code>'\u0041'</code>
   * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code>
   * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code>
   * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through
   * <code>'\uFF5A'</code> (full width variants).
   *
   * <p>If the character lacks a numeric value property, -1 is returned.
   * If the character has a numeric value property which is not representable
   * as a nonnegative integer, such as a fraction, -2 is returned.
   * <br>
   * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A
   *   |U+FF21-U+FF3A|U+FF41-U+FF5A
   *
   * @param ch character from which the numeric value property will
   *        be retrieved
   * @return the numeric value property of ch, or -1 if it does not exist, or
   *         -2 if it is not representable as a nonnegative integer
   * @see #forDigit(int, int)
   * @see #digit(char, int)
   * @see #isDigit(char)
   * @since 1.1
   */
  public static native int getNumericValue(char ch);

  /**
   * Returns the Unicode numeric value property of a character. For example,
   * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50.
   *
   * <p>This method also returns values for the letters A through Z, (not
   * specified by Unicode), in these ranges: <code>'\u0041'</code>
   * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code>
   * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code>
   * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through
   * <code>'\uFF5A'</code> (full width variants).
   *
   * <p>If the character lacks a numeric value property, -1 is returned.
3094: * If the character has a numeric value property which is not representable 3095: * as a nonnegative integer, such as a fraction, -2 is returned. 3096: * 3097: * Unlike getNumericValue(char), this method supports supplementary Unicode 3098: * code points. 3099: * 3100: * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 3101: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3102: * 3103: * @param codePoint character from which the numeric value property will 3104: * be retrieved 3105: * @return the numeric value property of codePoint, or -1 if it does not 3106: * exist, or -2 if it is not representable as a nonnegative integer 3107: * @see #forDigit(int, int) 3108: * @see #digit(int, int) 3109: * @see #isDigit(int) 3110: * @since 1.5 3111: */ 3112: public static native int getNumericValue(int codePoint); 3113: 3114: /** 3115: * Determines if a character is a ISO-LATIN-1 space. This is only the five 3116: * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>, 3117: * <code>'\r'</code>, and <code>' '</code>. 3118: * <br> 3119: * Java space = U+0020|U+0009|U+000A|U+000C|U+000D 3120: * 3121: * @param ch character to test 3122: * @return true if ch is a space, else false 3123: * @deprecated Replaced by {@link #isWhitespace(char)} 3124: * @see #isSpaceChar(char) 3125: * @see #isWhitespace(char) 3126: */ 3127: public static boolean isSpace(char ch) 3128: { 3129: // Performing the subtraction up front alleviates need to compare longs. 3130: return ch-- <= ' ' && ((1 << ch) 3131: & ((1 << (' ' - 1)) 3132: | (1 << ('\t' - 1)) 3133: | (1 << ('\n' - 1)) 3134: | (1 << ('\r' - 1)) 3135: | (1 << ('\f' - 1)))) != 0; 3136: } 3137: 3138: /** 3139: * Determines if a character is a Unicode space character. This includes 3140: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. 
3141: * <br> 3142: * Unicode space = [Zs]|[Zp]|[Zl] 3143: * 3144: * @param ch character to test 3145: * @return true if ch is a Unicode space, else false 3146: * @see #isWhitespace(char) 3147: * @since 1.1 3148: */ 3149: public static boolean isSpaceChar(char ch) 3150: { 3151: return ((1 << getType(ch)) 3152: & ((1 << SPACE_SEPARATOR) 3153: | (1 << LINE_SEPARATOR) 3154: | (1 << PARAGRAPH_SEPARATOR))) != 0; 3155: } 3156: 3157: /** 3158: * Determines if a character is a Unicode space character. This includes 3159: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. Unlike 3160: * isSpaceChar(char), this method supports supplementary Unicode code points. 3161: * <br> 3162: * Unicode space = [Zs]|[Zp]|[Zl] 3163: * 3164: * @param codePoint character to test 3165: * @return true if codePoint is a Unicode space, else false 3166: * @see #isWhitespace(int) 3167: * @since 1.5 3168: */ 3169: public static boolean isSpaceChar(int codePoint) 3170: { 3171: return ((1 << getType(codePoint)) 3172: & ((1 << SPACE_SEPARATOR) 3173: | (1 << LINE_SEPARATOR) 3174: | (1 << PARAGRAPH_SEPARATOR))) != 0; 3175: } 3176: 3177: /** 3178: * Determines if a character is Java whitespace. This includes Unicode 3179: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 3180: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 3181: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 3182: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 3183: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 3184: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 3185: * and <code>'\u001F'</code>. 
3186: * <br> 3187: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 3188: * 3189: * @param ch character to test 3190: * @return true if ch is Java whitespace, else false 3191: * @see #isSpaceChar(char) 3192: * @since 1.1 3193: */ 3194: public static boolean isWhitespace(char ch) 3195: { 3196: int attr = readChar(ch); 3197: return ((((1 << (attr & TYPE_MASK)) 3198: & ((1 << SPACE_SEPARATOR) 3199: | (1 << LINE_SEPARATOR) 3200: | (1 << PARAGRAPH_SEPARATOR))) != 0) 3201: && (attr & NO_BREAK_MASK) == 0) 3202: || (ch <= '\u001F' && ((1 << ch) 3203: & ((1 << '\t') 3204: | (1 << '\n') 3205: | (1 << '\u000B') 3206: | (1 << '\u000C') 3207: | (1 << '\r') 3208: | (1 << '\u001C') 3209: | (1 << '\u001D') 3210: | (1 << '\u001E') 3211: | (1 << '\u001F'))) != 0); 3212: } 3213: 3214: /** 3215: * Determines if a character is Java whitespace. This includes Unicode 3216: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 3217: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 3218: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 3219: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 3220: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 3221: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 3222: * and <code>'\u001F'</code>. Unlike isWhitespace(char), this method 3223: * supports supplementary Unicode code points. 
3224: * <br> 3225: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 3226: * 3227: * @param codePoint character to test 3228: * @return true if codePoint is Java whitespace, else false 3229: * @see #isSpaceChar(int) 3230: * @since 1.5 3231: */ 3232: public static boolean isWhitespace(int codePoint) 3233: { 3234: int plane = codePoint >>> 16; 3235: if (plane > 2 && plane != 14) 3236: return false; 3237: int attr = readCodePoint(codePoint); 3238: return ((((1 << (attr & TYPE_MASK)) 3239: & ((1 << SPACE_SEPARATOR) 3240: | (1 << LINE_SEPARATOR) 3241: | (1 << PARAGRAPH_SEPARATOR))) != 0) 3242: && (attr & NO_BREAK_MASK) == 0) 3243: || (codePoint <= '\u001F' && ((1 << codePoint) 3244: & ((1 << '\t') 3245: | (1 << '\n') 3246: | (1 << '\u000B') 3247: | (1 << '\u000C') 3248: | (1 << '\r') 3249: | (1 << '\u001C') 3250: | (1 << '\u001D') 3251: | (1 << '\u001E') 3252: | (1 << '\u001F'))) != 0); 3253: } 3254: 3255: /** 3256: * Determines if a character has the ISO Control property. 3257: * <br> 3258: * ISO Control = [Cc] 3259: * 3260: * @param ch character to test 3261: * @return true if ch is an ISO Control character, else false 3262: * @see #isSpaceChar(char) 3263: * @see #isWhitespace(char) 3264: * @since 1.1 3265: */ 3266: public static boolean isISOControl(char ch) 3267: { 3268: return getType(ch) == CONTROL; 3269: } 3270: 3271: /** 3272: * Determines if a character has the ISO Control property. Unlike 3273: * isISOControl(char), this method supports supplementary unicode 3274: * code points. 3275: * <br> 3276: * ISO Control = [Cc] 3277: * 3278: * @param codePoint character to test 3279: * @return true if codePoint is an ISO Control character, else false 3280: * @see #isSpaceChar(int) 3281: * @see #isWhitespace(int) 3282: * @since 1.5 3283: */ 3284: public static boolean isISOControl(int codePoint) 3285: { 3286: return getType(codePoint) == CONTROL; 3287: } 3288: 3289: /** 3290: * Returns the Unicode general category property of a character. 
3291: * 3292: * @param ch character from which the general category property will 3293: * be retrieved 3294: * @return the character category property of ch as an integer 3295: * @see #UNASSIGNED 3296: * @see #UPPERCASE_LETTER 3297: * @see #LOWERCASE_LETTER 3298: * @see #TITLECASE_LETTER 3299: * @see #MODIFIER_LETTER 3300: * @see #OTHER_LETTER 3301: * @see #NON_SPACING_MARK 3302: * @see #ENCLOSING_MARK 3303: * @see #COMBINING_SPACING_MARK 3304: * @see #DECIMAL_DIGIT_NUMBER 3305: * @see #LETTER_NUMBER 3306: * @see #OTHER_NUMBER 3307: * @see #SPACE_SEPARATOR 3308: * @see #LINE_SEPARATOR 3309: * @see #PARAGRAPH_SEPARATOR 3310: * @see #CONTROL 3311: * @see #FORMAT 3312: * @see #PRIVATE_USE 3313: * @see #SURROGATE 3314: * @see #DASH_PUNCTUATION 3315: * @see #START_PUNCTUATION 3316: * @see #END_PUNCTUATION 3317: * @see #CONNECTOR_PUNCTUATION 3318: * @see #OTHER_PUNCTUATION 3319: * @see #MATH_SYMBOL 3320: * @see #CURRENCY_SYMBOL 3321: * @see #MODIFIER_SYMBOL 3322: * @see #INITIAL_QUOTE_PUNCTUATION 3323: * @see #FINAL_QUOTE_PUNCTUATION 3324: * @since 1.1 3325: */ 3326: public static native int getType(char ch); 3327: 3328: /** 3329: * Returns the Unicode general category property of a character. Supports 3330: * supplementary Unicode code points. 
3331: * 3332: * @param codePoint character from which the general category property will 3333: * be retrieved 3334: * @return the character category property of codePoint as an integer 3335: * @see #UNASSIGNED 3336: * @see #UPPERCASE_LETTER 3337: * @see #LOWERCASE_LETTER 3338: * @see #TITLECASE_LETTER 3339: * @see #MODIFIER_LETTER 3340: * @see #OTHER_LETTER 3341: * @see #NON_SPACING_MARK 3342: * @see #ENCLOSING_MARK 3343: * @see #COMBINING_SPACING_MARK 3344: * @see #DECIMAL_DIGIT_NUMBER 3345: * @see #LETTER_NUMBER 3346: * @see #OTHER_NUMBER 3347: * @see #SPACE_SEPARATOR 3348: * @see #LINE_SEPARATOR 3349: * @see #PARAGRAPH_SEPARATOR 3350: * @see #CONTROL 3351: * @see #FORMAT 3352: * @see #PRIVATE_USE 3353: * @see #SURROGATE 3354: * @see #DASH_PUNCTUATION 3355: * @see #START_PUNCTUATION 3356: * @see #END_PUNCTUATION 3357: * @see #CONNECTOR_PUNCTUATION 3358: * @see #OTHER_PUNCTUATION 3359: * @see #MATH_SYMBOL 3360: * @see #CURRENCY_SYMBOL 3361: * @see #MODIFIER_SYMBOL 3362: * @see #INITIAL_QUOTE_PUNCTUATION 3363: * @see #FINAL_QUOTE_PUNCTUATION 3364: * @since 1.5 3365: */ 3366: public static native int getType(int codePoint); 3367: 3368: /** 3369: * Converts a digit into a character which represents that digit 3370: * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX, 3371: * or the digit exceeds the radix, then the null character <code>'\0'</code> 3372: * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'. 3373: * <br> 3374: * return value boundary = U+0030-U+0039|U+0061-U+007A 3375: * 3376: * @param digit digit to be converted into a character 3377: * @param radix radix of digit 3378: * @return character representing digit in radix, or '\0' 3379: * @see #MIN_RADIX 3380: * @see #MAX_RADIX 3381: * @see #digit(char, int) 3382: */ 3383: public static char forDigit(int digit, int radix) 3384: { 3385: if (radix < MIN_RADIX || radix > MAX_RADIX 3386: || digit < 0 || digit >= radix) 3387: return '\0'; 3388: return (char) (digit < 10 ? 
('0' + digit) : ('a' - 10 + digit)); 3389: } 3390: 3391: /** 3392: * Returns the Unicode directionality property of the character. This 3393: * is used in the visual ordering of text. 3394: * 3395: * @param ch the character to look up 3396: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 3397: * @see #DIRECTIONALITY_UNDEFINED 3398: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 3399: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 3400: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 3401: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 3402: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 3403: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 3404: * @see #DIRECTIONALITY_ARABIC_NUMBER 3405: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 3406: * @see #DIRECTIONALITY_NONSPACING_MARK 3407: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 3408: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 3409: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 3410: * @see #DIRECTIONALITY_WHITESPACE 3411: * @see #DIRECTIONALITY_OTHER_NEUTRALS 3412: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 3413: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 3414: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 3415: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 3416: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 3417: * @since 1.4 3418: */ 3419: public static native byte getDirectionality(char ch); 3420: 3421: /** 3422: * Returns the Unicode directionality property of the character. This 3423: * is used in the visual ordering of text. Unlike getDirectionality(char), 3424: * this method supports supplementary Unicode code points. 
3425: * 3426: * @param codePoint the character to look up 3427: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 3428: * @see #DIRECTIONALITY_UNDEFINED 3429: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 3430: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 3431: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 3432: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 3433: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 3434: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 3435: * @see #DIRECTIONALITY_ARABIC_NUMBER 3436: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 3437: * @see #DIRECTIONALITY_NONSPACING_MARK 3438: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 3439: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 3440: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 3441: * @see #DIRECTIONALITY_WHITESPACE 3442: * @see #DIRECTIONALITY_OTHER_NEUTRALS 3443: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 3444: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 3445: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 3446: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 3447: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 3448: * @since 1.5 3449: */ 3450: public static native byte getDirectionality(int codePoint); 3451: 3452: /** 3453: * Determines whether the character is mirrored according to Unicode. For 3454: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 3455: * left-to-right text, but ')' in right-to-left text. 3456: * 3457: * @param ch the character to look up 3458: * @return true if the character is mirrored 3459: * @since 1.4 3460: */ 3461: public static boolean isMirrored(char ch) 3462: { 3463: return (readChar(ch) & MIRROR_MASK) != 0; 3464: } 3465: 3466: /** 3467: * Determines whether the character is mirrored according to Unicode. For 3468: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 3469: * left-to-right text, but ')' in right-to-left text. Unlike 3470: * isMirrored(char), this method supports supplementary Unicode code points. 
3471: * 3472: * @param codePoint the character to look up 3473: * @return true if the character is mirrored 3474: * @since 1.5 3475: */ 3476: public static boolean isMirrored(int codePoint) 3477: { 3478: int plane = codePoint >>> 16; 3479: if (plane > 2 && plane != 14) 3480: return false; 3481: return (readCodePoint(codePoint) & MIRROR_MASK) != 0; 3482: } 3483: 3484: /** 3485: * Compares another Character to this Character, numerically. 3486: * 3487: * @param anotherCharacter Character to compare with this Character 3488: * @return a negative integer if this Character is less than 3489: * anotherCharacter, zero if this Character is equal, and 3490: * a positive integer if this Character is greater 3491: * @throws NullPointerException if anotherCharacter is null 3492: * @since 1.2 3493: */ 3494: public int compareTo(Character anotherCharacter) 3495: { 3496: return value - anotherCharacter.value; 3497: } 3498: 3499: /** 3500: * Compares an object to this Character. Assuming the object is a 3501: * Character object, this method performs the same comparison as 3502: * compareTo(Character). 3503: * 3504: * @param o object to compare 3505: * @return the comparison value 3506: * @throws ClassCastException if o is not a Character object 3507: * @throws NullPointerException if o is null 3508: * @see #compareTo(Character) 3509: * @since 1.2 3510: */ 3511: public int compareTo(Object o) 3512: { 3513: return compareTo((Character) o); 3514: } 3515: 3516: /** 3517: * Returns an <code>Character</code> object wrapping the value. 3518: * In contrast to the <code>Character</code> constructor, this method 3519: * will cache some values. It is used by boxing conversion. 
3520: * 3521: * @param val the value to wrap 3522: * @return the <code>Character</code> 3523: * 3524: * @since 1.5 3525: */ 3526: public static Character valueOf(char val) 3527: { 3528: if (val > MAX_CACHE) 3529: return new Character(val); 3530: synchronized (charCache) 3531: { 3532: if (charCache[val - MIN_VALUE] == null) 3533: charCache[val - MIN_VALUE] = new Character(val); 3534: return charCache[val - MIN_VALUE]; 3535: } 3536: } 3537: 3538: /** 3539: * Reverse the bytes in val. 3540: * @since 1.5 3541: */ 3542: public static char reverseBytes(char val) 3543: { 3544: return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00)); 3545: } 3546: 3547: /** 3548: * Converts a unicode code point to a UTF-16 representation of that 3549: * code point. 3550: * 3551: * @param codePoint the unicode code point 3552: * 3553: * @return the UTF-16 representation of that code point 3554: * 3555: * @throws IllegalArgumentException if the code point is not a valid 3556: * unicode code point 3557: * 3558: * @since 1.5 3559: */ 3560: public static char[] toChars(int codePoint) 3561: { 3562: char[] result = new char[charCount(codePoint)]; 3563: int ignore = toChars(codePoint, result, 0); 3564: return result; 3565: } 3566: 3567: /** 3568: * Converts a unicode code point to its UTF-16 representation. 
3569: * 3570: * @param codePoint the unicode code point 3571: * @param dst the target char array 3572: * @param dstIndex the start index for the target 3573: * 3574: * @return number of characters written to <code>dst</code> 3575: * 3576: * @throws IllegalArgumentException if <code>codePoint</code> is not a 3577: * valid unicode code point 3578: * @throws NullPointerException if <code>dst</code> is <code>null</code> 3579: * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid 3580: * in <code>dst</code> or if the UTF-16 representation does not 3581: * fit into <code>dst</code> 3582: * 3583: * @since 1.5 3584: */ 3585: public static int toChars(int codePoint, char[] dst, int dstIndex) 3586: { 3587: if (!isValidCodePoint(codePoint)) 3588: { 3589: throw new IllegalArgumentException("not a valid code point: " 3590: + codePoint); 3591: } 3592: 3593: int result; 3594: if (isSupplementaryCodePoint(codePoint)) 3595: { 3596: // Write second char first to cause IndexOutOfBoundsException 3597: // immediately. 3598: final int cp2 = codePoint - 0x10000; 3599: dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE); 3600: dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE); 3601: result = 2; 3602: } 3603: else 3604: { 3605: dst[dstIndex] = (char) codePoint; 3606: result = 1; 3607: } 3608: return result; 3609: } 3610: 3611: /** 3612: * Return number of 16-bit characters required to represent the given 3613: * code point. 3614: * 3615: * @param codePoint a unicode code point 3616: * 3617: * @return 2 if codePoint >= 0x10000, 1 otherwise. 3618: * 3619: * @since 1.5 3620: */ 3621: public static int charCount(int codePoint) 3622: { 3623: return 3624: (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) 3625: ? 2 3626: : 1; 3627: } 3628: 3629: /** 3630: * Determines whether the specified code point is 3631: * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode 3632: * supplementary character range. 
3633: * 3634: * @param codePoint a Unicode code point 3635: * 3636: * @return <code>true</code> if code point is in supplementary range 3637: * 3638: * @since 1.5 3639: */ 3640: public static boolean isSupplementaryCodePoint(int codePoint) 3641: { 3642: return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT 3643: && codePoint <= MAX_CODE_POINT; 3644: } 3645: 3646: /** 3647: * Determines whether the specified code point is 3648: * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point. 3649: * 3650: * @param codePoint a Unicode code point 3651: * 3652: * @return <code>true</code> if code point is valid 3653: * 3654: * @since 1.5 3655: */ 3656: public static boolean isValidCodePoint(int codePoint) 3657: { 3658: return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT; 3659: } 3660: 3661: /** 3662: * Return true if the given character is a high surrogate. 3663: * @param ch the character 3664: * @return true if the character is a high surrogate character 3665: * 3666: * @since 1.5 3667: */ 3668: public static boolean isHighSurrogate(char ch) 3669: { 3670: return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE; 3671: } 3672: 3673: /** 3674: * Return true if the given character is a low surrogate. 3675: * @param ch the character 3676: * @return true if the character is a low surrogate character 3677: * 3678: * @since 1.5 3679: */ 3680: public static boolean isLowSurrogate(char ch) 3681: { 3682: return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE; 3683: } 3684: 3685: /** 3686: * Return true if the given characters compose a surrogate pair. 3687: * This is true if the first character is a high surrogate and the 3688: * second character is a low surrogate. 
3689: * @param ch1 the first character 3690: * @param ch2 the first character 3691: * @return true if the characters compose a surrogate pair 3692: * 3693: * @since 1.5 3694: */ 3695: public static boolean isSurrogatePair(char ch1, char ch2) 3696: { 3697: return isHighSurrogate(ch1) && isLowSurrogate(ch2); 3698: } 3699: 3700: /** 3701: * Given a valid surrogate pair, this returns the corresponding 3702: * code point. 3703: * @param high the high character of the pair 3704: * @param low the low character of the pair 3705: * @return the corresponding code point 3706: * 3707: * @since 1.5 3708: */ 3709: public static int toCodePoint(char high, char low) 3710: { 3711: return ((high - MIN_HIGH_SURROGATE) * 0x400) + 3712: (low - MIN_LOW_SURROGATE) + 0x10000; 3713: } 3714: 3715: /** 3716: * Get the code point at the specified index in the CharSequence. 3717: * This is like CharSequence#charAt(int), but if the character is 3718: * the start of a surrogate pair, and there is a following 3719: * character, and this character completes the pair, then the 3720: * corresponding supplementary code point is returned. Otherwise, 3721: * the character at the index is returned. 3722: * 3723: * @param sequence the CharSequence 3724: * @param index the index of the codepoint to get, starting at 0 3725: * @return the codepoint at the specified index 3726: * @throws IndexOutOfBoundsException if index is negative or >= length() 3727: * @since 1.5 3728: */ 3729: public static int codePointAt(CharSequence sequence, int index) 3730: { 3731: int len = sequence.length(); 3732: if (index < 0 || index >= len) 3733: throw new IndexOutOfBoundsException(); 3734: char high = sequence.charAt(index); 3735: if (! isHighSurrogate(high) || ++index >= len) 3736: return high; 3737: char low = sequence.charAt(index); 3738: if (! isLowSurrogate(low)) 3739: return high; 3740: return toCodePoint(high, low); 3741: } 3742: 3743: /** 3744: * Get the code point at the specified index in the CharSequence. 
3745: * If the character is the start of a surrogate pair, and there is a 3746: * following character, and this character completes the pair, then 3747: * the corresponding supplementary code point is returned. 3748: * Otherwise, the character at the index is returned. 3749: * 3750: * @param chars the character array in which to look 3751: * @param index the index of the codepoint to get, starting at 0 3752: * @return the codepoint at the specified index 3753: * @throws IndexOutOfBoundsException if index is negative or >= length() 3754: * @since 1.5 3755: */ 3756: public static int codePointAt(char[] chars, int index) 3757: { 3758: return codePointAt(chars, index, chars.length); 3759: } 3760: 3761: /** 3762: * Get the code point at the specified index in the CharSequence. 3763: * If the character is the start of a surrogate pair, and there is a 3764: * following character within the specified range, and this 3765: * character completes the pair, then the corresponding 3766: * supplementary code point is returned. Otherwise, the character 3767: * at the index is returned. 3768: * 3769: * @param chars the character array in which to look 3770: * @param index the index of the codepoint to get, starting at 0 3771: * @param limit the limit past which characters should not be examined 3772: * @return the codepoint at the specified index 3773: * @throws IndexOutOfBoundsException if index is negative or >= 3774: * limit, or if limit is negative or >= the length of the array 3775: * @since 1.5 3776: */ 3777: public static int codePointAt(char[] chars, int index, int limit) 3778: { 3779: if (index < 0 || index >= limit || limit < 0 || limit >= chars.length) 3780: throw new IndexOutOfBoundsException(); 3781: char high = chars[index]; 3782: if (! isHighSurrogate(high) || ++index >= limit) 3783: return high; 3784: char low = chars[index]; 3785: if (! 
isLowSurrogate(low)) 3786: return high; 3787: return toCodePoint(high, low); 3788: } 3789: 3790: /** 3791: * Get the code point before the specified index. This is like 3792: * #codePointAt(char[], int), but checks the characters at 3793: * <code>index-1</code> and <code>index-2</code> to see if they form 3794: * a supplementary code point. If they do not, the character at 3795: * <code>index-1</code> is returned. 3796: * 3797: * @param chars the character array 3798: * @param index the index just past the codepoint to get, starting at 0 3799: * @return the codepoint at the specified index 3800: * @throws IndexOutOfBoundsException if index is negative or >= length() 3801: * @since 1.5 3802: */ 3803: public static int codePointBefore(char[] chars, int index) 3804: { 3805: return codePointBefore(chars, index, 1); 3806: } 3807: 3808: /** 3809: * Get the code point before the specified index. This is like 3810: * #codePointAt(char[], int), but checks the characters at 3811: * <code>index-1</code> and <code>index-2</code> to see if they form 3812: * a supplementary code point. If they do not, the character at 3813: * <code>index-1</code> is returned. The start parameter is used to 3814: * limit the range of the array which may be examined. 3815: * 3816: * @param chars the character array 3817: * @param index the index just past the codepoint to get, starting at 0 3818: * @param start the index before which characters should not be examined 3819: * @return the codepoint at the specified index 3820: * @throws IndexOutOfBoundsException if index is > start or > 3821: * the length of the array, or if limit is negative or >= the 3822: * length of the array 3823: * @since 1.5 3824: */ 3825: public static int codePointBefore(char[] chars, int index, int start) 3826: { 3827: if (index < start || index > chars.length 3828: || start < 0 || start >= chars.length) 3829: throw new IndexOutOfBoundsException(); 3830: --index; 3831: char low = chars[index]; 3832: if (! 
isLowSurrogate(low) || --index < start) 3833: return low; 3834: char high = chars[index]; 3835: if (! isHighSurrogate(high)) 3836: return low; 3837: return toCodePoint(high, low); 3838: } 3839: 3840: /** 3841: * Get the code point before the specified index. This is like 3842: * #codePointAt(CharSequence, int), but checks the characters at 3843: * <code>index-1</code> and <code>index-2</code> to see if they form 3844: * a supplementary code point. If they do not, the character at 3845: * <code>index-1</code> is returned. 3846: * 3847: * @param sequence the CharSequence 3848: * @param index the index just past the codepoint to get, starting at 0 3849: * @return the codepoint at the specified index 3850: * @throws IndexOutOfBoundsException if index is negative or >= length() 3851: * @since 1.5 3852: */ 3853: public static int codePointBefore(CharSequence sequence, int index) 3854: { 3855: int len = sequence.length(); 3856: if (index < 1 || index > len) 3857: throw new IndexOutOfBoundsException(); 3858: --index; 3859: char low = sequence.charAt(index); 3860: if (! isLowSurrogate(low) || --index < 0) 3861: return low; 3862: char high = sequence.charAt(index); 3863: if (! isHighSurrogate(high)) 3864: return low; 3865: return toCodePoint(high, low); 3866: } 3867: } // class Character