Frames | No Frames |
1: /* StreamTokenizer.java -- parses streams of characters into tokens 2: Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 Free Software Foundation 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: package java.io; 39: 40: /** 41: * This class parses streams of characters into tokens. There are a 42: * million-zillion flags that can be set to control the parsing, as 43: * described under the various method headings. 44: * 45: * @author Warren Levy (warrenl@cygnus.com) 46: * @date October 25, 1998. 47: */ 48: /* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3 49: * "The Java Language Specification", ISBN 0-201-63451-1 50: * plus online API docs for JDK 1.2 beta from http://www.javasoft.com. 51: * Status: Believed complete and correct. 52: */ 53: 54: public class StreamTokenizer 55: { 56: /** A constant indicating that the end of the stream has been read. */ 57: public static final int TT_EOF = -1; 58: 59: /** A constant indicating that the end of the line has been read. */ 60: public static final int TT_EOL = '\n'; 61: 62: /** A constant indicating that a number token has been read. */ 63: public static final int TT_NUMBER = -2; 64: 65: /** A constant indicating that a word token has been read. */ 66: public static final int TT_WORD = -3; 67: 68: /** A constant indicating that no tokens have been read yet. */ 69: private static final int TT_NONE = -4; 70: 71: /** 72: * Contains the type of the token read resulting from a call to nextToken 73: * The rules are as follows: 74: * <ul> 75: * <li>For a token consisting of a single ordinary character, this is the 76: * value of that character.</li> 77: * <li>For a quoted string, this is the value of the quote character</li> 78: * <li>For a word, this is TT_WORD</li> 79: * <li>For a number, this is TT_NUMBER</li> 80: * <li>For the end of the line, this is TT_EOL</li> 81: * <li>For the end of the stream, this is TT_EOF</li> 82: * </ul> 83: */ 84: public int ttype = TT_NONE; 85: 86: /** The String associated with word and string tokens. */ 87: public String sval; 88: 89: /** The numeric value associated with number tokens. */ 90: public double nval; 91: 92: /* Indicates whether end-of-line is recognized as a token. */ 93: private boolean eolSignificant = false; 94: 95: /* Indicates whether word tokens are automatically made lower case. */ 96: private boolean lowerCase = false; 97: 98: /* Indicates whether C++ style comments are recognized and skipped. */ 99: private boolean slashSlash = false; 100: 101: /* Indicates whether C style comments are recognized and skipped. */ 102: private boolean slashStar = false; 103: 104: /* Attribute tables of each byte from 0x00 to 0xFF. */ 105: private boolean[] whitespace = new boolean[256]; 106: private boolean[] alphabetic = new boolean[256]; 107: private boolean[] numeric = new boolean[256]; 108: private boolean[] quote = new boolean[256]; 109: private boolean[] comment = new boolean[256]; 110: 111: /* The Reader associated with this class. */ 112: private PushbackReader in; 113: 114: /* Indicates if a token has been pushed back. */ 115: private boolean pushedBack = false; 116: 117: /* Contains the current line number of the reader. */ 118: private int lineNumber = 1; 119: 120: /** 121: * This method reads bytes from an <code>InputStream</code> and tokenizes 122: * them. For details on how this method operates by default, see 123: * <code>StreamTokenizer(Reader)</code>. 124: * 125: * @param is The <code>InputStream</code> to read from 126: * 127: * @deprecated Since JDK 1.1. 128: */ 129: public StreamTokenizer(InputStream is) 130: { 131: this(new InputStreamReader(is)); 132: } 133: 134: /** 135: * This method initializes a new <code>StreamTokenizer</code> to read 136: * characters from a <code>Reader</code> and parse them. The char values 137: * have their hight bits masked so that the value is treated a character 138: * in the range of 0x0000 to 0x00FF. 139: * <p> 140: * This constructor sets up the parsing table to parse the stream in the 141: * following manner: 142: * <ul> 143: * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF 144: * are initialized as alphabetic</li> 145: * <li>The values 0x00 through 0x20 are initialized as whitespace</li> 146: * <li>The values '\'' and '"' are initialized as quote characters</li> 147: * <li>'/' is a comment character</li> 148: * <li>Numbers will be parsed</li> 149: * <li>EOL is not treated as significant</li> 150: * <li>C and C++ (//) comments are not recognized</li> 151: * </ul> 152: * 153: * @param r The <code>Reader</code> to read chars from 154: */ 155: public StreamTokenizer(Reader r) 156: { 157: in = new PushbackReader(r); 158: 159: whitespaceChars(0x00, 0x20); 160: wordChars('A', 'Z'); 161: wordChars('a', 'z'); 162: wordChars(0xA0, 0xFF); 163: commentChar('/'); 164: quoteChar('\''); 165: quoteChar('"'); 166: parseNumbers(); 167: } 168: 169: /** 170: * This method sets the comment attribute on the specified 171: * character. Other attributes for the character are cleared. 172: * 173: * @param ch The character to set the comment attribute for, passed as an int 174: */ 175: public void commentChar(int ch) 176: { 177: if (ch >= 0 && ch <= 255) 178: { 179: comment[ch] = true; 180: whitespace[ch] = false; 181: alphabetic[ch] = false; 182: numeric[ch] = false; 183: quote[ch] = false; 184: } 185: } 186: 187: /** 188: * This method sets a flag that indicates whether or not the end of line 189: * sequence terminates and is a token. The defaults to <code>false</code> 190: * 191: * @param flag <code>true</code> if EOF is significant, <code>false</code> 192: * otherwise 193: */ 194: public void eolIsSignificant(boolean flag) 195: { 196: eolSignificant = flag; 197: } 198: 199: /** 200: * This method returns the current line number. Note that if the 201: * <code>pushBack()</code> method is called, it has no effect on the 202: * line number returned by this method. 203: * 204: * @return The current line number 205: */ 206: public int lineno() 207: { 208: return lineNumber; 209: } 210: 211: /** 212: * This method sets a flag that indicates whether or not alphabetic 213: * tokens that are returned should be converted to lower case. 214: * 215: * @param flag <code>true</code> to convert to lower case, 216: * <code>false</code> otherwise 217: */ 218: public void lowerCaseMode(boolean flag) 219: { 220: lowerCase = flag; 221: } 222: 223: private boolean isWhitespace(int ch) 224: { 225: return (ch >= 0 && ch <= 255 && whitespace[ch]); 226: } 227: 228: private boolean isAlphabetic(int ch) 229: { 230: return ((ch > 255) || (ch >= 0 && alphabetic[ch])); 231: } 232: 233: private boolean isNumeric(int ch) 234: { 235: return (ch >= 0 && ch <= 255 && numeric[ch]); 236: } 237: 238: private boolean isQuote(int ch) 239: { 240: return (ch >= 0 && ch <= 255 && quote[ch]); 241: } 242: 243: private boolean isComment(int ch) 244: { 245: return (ch >= 0 && ch <= 255 && comment[ch]); 246: } 247: 248: /** 249: * This method reads the next token from the stream. It sets the 250: * <code>ttype</code> variable to the appropriate token type and 251: * returns it. It also can set <code>sval</code> or <code>nval</code> 252: * as described below. The parsing strategy is as follows: 253: * <ul> 254: * <li>Skip any whitespace characters.</li> 255: * <li>If a numeric character is encountered, attempt to parse a numeric 256: * value. Leading '-' characters indicate a numeric only if followed by 257: * another non-'-' numeric. The value of the numeric token is terminated 258: * by either the first non-numeric encountered, or the second occurrence of 259: * '-' or '.'. The token type returned is TT_NUMBER and <code>nval</code> 260: * is set to the value parsed.</li> 261: * <li>If an alphabetic character is parsed, all subsequent characters 262: * are read until the first non-alphabetic or non-numeric character is 263: * encountered. The token type returned is TT_WORD and the value parsed 264: * is stored in <code>sval</code>. If lower case mode is set, the token 265: * stored in <code>sval</code> is converted to lower case. The end of line 266: * sequence terminates a word only if EOL signficance has been turned on. 267: * The start of a comment also terminates a word. Any character with a 268: * non-alphabetic and non-numeric attribute (such as white space, a quote, 269: * or a commet) are treated as non-alphabetic and terminate the word.</li> 270: * <li>If a comment character is parsed, then all remaining characters on 271: * the current line are skipped and another token is parsed. Any EOL or 272: * EOF's encountered are not discarded, but rather terminate the comment.</li> 273: * <li>If a quote character is parsed, then all characters up to the 274: * second occurrence of the same quote character are parsed into a 275: * <code>String</code>. This <code>String</code> is stored as 276: * <code>sval</code>, but is not converted to lower case, even if lower case 277: * mode is enabled. The token type returned is the value of the quote 278: * character encountered. Any escape sequences 279: * (\b (backspace), \t (HTAB), \n (linefeed), \f (form feed), \r 280: * (carriage return), \" (double quote), \' (single quote), \\ 281: * (backslash), \XXX (octal esacpe)) are converted to the appropriate 282: * char values. Invalid esacape sequences are left in untranslated. 283: * Unicode characters like ('\ u0000') are not recognized. </li> 284: * <li>If the C++ comment sequence "//" is encountered, and the parser 285: * is configured to handle that sequence, then the remainder of the line 286: * is skipped and another token is read exactly as if a character with 287: * the comment attribute was encountered.</li> 288: * <li>If the C comment sequence "/*" is encountered, and the parser 289: * is configured to handle that sequence, then all characters up to and 290: * including the comment terminator sequence are discarded and another 291: * token is parsed.</li> 292: * <li>If all cases above are not met, then the character is an ordinary 293: * character that is parsed as a token by itself. The char encountered 294: * is returned as the token type.</li> 295: * </ul> 296: * 297: * @return The token type 298: * @exception IOException If an I/O error occurs 299: */ 300: public int nextToken() throws IOException 301: { 302: if (pushedBack) 303: { 304: pushedBack = false; 305: if (ttype != TT_NONE) 306: return ttype; 307: } 308: 309: sval = null; 310: int ch; 311: 312: // Skip whitespace. Deal with EOL along the way. 313: while (isWhitespace(ch = in.read())) 314: if (ch == '\n' || ch == '\r') 315: { 316: lineNumber++; 317: 318: // Throw away \n if in combination with \r. 319: if (ch == '\r' && (ch = in.read()) != '\n') 320: { 321: if (ch != TT_EOF) 322: in.unread(ch); 323: } 324: if (eolSignificant) 325: return (ttype = TT_EOL); 326: } 327: 328: if (ch == '/') 329: if ((ch = in.read()) == '/' && slashSlash) 330: { 331: while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF) 332: ; 333: if (ch != TT_EOF) 334: in.unread(ch); 335: return nextToken(); // Recursive, but not too deep in normal cases 336: } 337: else if (ch == '*' && slashStar) 338: { 339: while (true) 340: { 341: ch = in.read(); 342: if (ch == '*') 343: { 344: if ((ch = in.read()) == '/') 345: break; 346: else if (ch != TT_EOF) 347: in.unread(ch); 348: } 349: else if (ch == '\n' || ch == '\r') 350: { 351: lineNumber++; 352: if (ch == '\r' && (ch = in.read()) != '\n') 353: { 354: if (ch != TT_EOF) 355: in.unread(ch); 356: } 357: } 358: else if (ch == TT_EOF) 359: { 360: break; 361: } 362: } 363: return nextToken(); // Recursive, but not too deep in normal cases 364: } 365: else 366: { 367: if (ch != TT_EOF) 368: in.unread(ch); 369: ch = '/'; 370: } 371: 372: if (ch == TT_EOF) 373: ttype = TT_EOF; 374: else if (isNumeric(ch)) 375: { 376: boolean isNegative = false; 377: if (ch == '-') 378: { 379: // Read ahead to see if this is an ordinary '-' rather than numeric. 380: ch = in.read(); 381: if (isNumeric(ch) && ch != '-') 382: { 383: isNegative = true; 384: } 385: else 386: { 387: if (ch != TT_EOF) 388: in.unread(ch); 389: return (ttype = '-'); 390: } 391: } 392: 393: StringBuffer tokbuf = new StringBuffer(); 394: tokbuf.append((char) ch); 395: 396: int decCount = 0; 397: while (isNumeric(ch = in.read()) && ch != '-') 398: if (ch == '.' && decCount++ > 0) 399: break; 400: else 401: tokbuf.append((char) ch); 402: 403: if (ch != TT_EOF) 404: in.unread(ch); 405: ttype = TT_NUMBER; 406: try 407: { 408: nval = Double.valueOf(tokbuf.toString()).doubleValue(); 409: } 410: catch (NumberFormatException _) 411: { 412: nval = 0.0; 413: } 414: if (isNegative) 415: nval = -nval; 416: } 417: else if (isAlphabetic(ch)) 418: { 419: StringBuffer tokbuf = new StringBuffer(); 420: tokbuf.append((char) ch); 421: while (isAlphabetic(ch = in.read()) || isNumeric(ch)) 422: tokbuf.append((char) ch); 423: if (ch != TT_EOF) 424: in.unread(ch); 425: ttype = TT_WORD; 426: sval = tokbuf.toString(); 427: if (lowerCase) 428: sval = sval.toLowerCase(); 429: } 430: else if (isComment(ch)) 431: { 432: while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF) 433: ; 434: if (ch != TT_EOF) 435: in.unread(ch); 436: return nextToken(); // Recursive, but not too deep in normal cases. 437: } 438: else if (isQuote(ch)) 439: { 440: ttype = ch; 441: StringBuffer tokbuf = new StringBuffer(); 442: while ((ch = in.read()) != ttype && ch != '\n' && ch != '\r' && 443: ch != TT_EOF) 444: { 445: if (ch == '\\') 446: switch (ch = in.read()) 447: { 448: case 'a': ch = 0x7; 449: break; 450: case 'b': ch = '\b'; 451: break; 452: case 'f': ch = 0xC; 453: break; 454: case 'n': ch = '\n'; 455: break; 456: case 'r': ch = '\r'; 457: break; 458: case 't': ch = '\t'; 459: break; 460: case 'v': ch = 0xB; 461: break; 462: case '\n': ch = '\n'; 463: break; 464: case '\r': ch = '\r'; 465: break; 466: case '\"': 467: case '\'': 468: case '\\': 469: break; 470: default: 471: int ch1, nextch; 472: if ((nextch = ch1 = ch) >= '0' && ch <= '7') 473: { 474: ch -= '0'; 475: if ((nextch = in.read()) >= '0' && nextch <= '7') 476: { 477: ch = ch * 8 + nextch - '0'; 478: if ((nextch = in.read()) >= '0' && nextch <= '7' && 479: ch1 >= '0' && ch1 <= '3') 480: { 481: ch = ch * 8 + nextch - '0'; 482: nextch = in.read(); 483: } 484: } 485: } 486: 487: if (nextch != TT_EOF) 488: in.unread(nextch); 489: } 490: 491: tokbuf.append((char) ch); 492: } 493: 494: // Throw away matching quote char. 495: if (ch != ttype && ch != TT_EOF) 496: in.unread(ch); 497: 498: sval = tokbuf.toString(); 499: } 500: else 501: { 502: ttype = ch; 503: } 504: 505: return ttype; 506: } 507: 508: private void resetChar(int ch) 509: { 510: whitespace[ch] = alphabetic[ch] = numeric[ch] = quote[ch] = comment[ch] = 511: false; 512: } 513: 514: /** 515: * This method makes the specified character an ordinary character. This 516: * means that none of the attributes (whitespace, alphabetic, numeric, 517: * quote, or comment) will be set on this character. This character will 518: * parse as its own token. 519: * 520: * @param ch The character to make ordinary, passed as an int 521: */ 522: public void ordinaryChar(int ch) 523: { 524: if (ch >= 0 && ch <= 255) 525: resetChar(ch); 526: } 527: 528: /** 529: * This method makes all the characters in the specified range, range 530: * terminators included, ordinary. This means the none of the attributes 531: * (whitespace, alphabetic, numeric, quote, or comment) will be set on 532: * any of the characters in the range. This makes each character in this 533: * range parse as its own token. 534: * 535: * @param low The low end of the range of values to set the whitespace 536: * attribute for 537: * @param hi The high end of the range of values to set the whitespace 538: * attribute for 539: */ 540: public void ordinaryChars(int low, int hi) 541: { 542: if (low < 0) 543: low = 0; 544: if (hi > 255) 545: hi = 255; 546: for (int i = low; i <= hi; i++) 547: resetChar(i); 548: } 549: 550: /** 551: * This method sets the numeric attribute on the characters '0' - '9' and 552: * the characters '.' and '-'. 553: * When this method is used, the result of giving other attributes 554: * (whitespace, quote, or comment) to the numeric characters may 555: * vary depending on the implementation. For example, if 556: * parseNumbers() and then whitespaceChars('1', '1') are called, 557: * this implementation reads "121" as 2, while some other implementation 558: * will read it as 21. 559: */ 560: public void parseNumbers() 561: { 562: for (int i = 0; i <= 9; i++) 563: numeric['0' + i] = true; 564: 565: numeric['.'] = true; 566: numeric['-'] = true; 567: } 568: 569: /** 570: * Puts the current token back into the StreamTokenizer so 571: * <code>nextToken</code> will return the same value on the next call. 572: * May cause the lineno method to return an incorrect value 573: * if lineno is called before the next call to nextToken. 574: */ 575: public void pushBack() 576: { 577: pushedBack = true; 578: } 579: 580: /** 581: * This method sets the quote attribute on the specified character. 582: * Other attributes for the character are cleared. 583: * 584: * @param ch The character to set the quote attribute for, passed as an int. 585: */ 586: public void quoteChar(int ch) 587: { 588: if (ch >= 0 && ch <= 255) 589: { 590: quote[ch] = true; 591: comment[ch] = false; 592: whitespace[ch] = false; 593: alphabetic[ch] = false; 594: numeric[ch] = false; 595: } 596: } 597: 598: /** 599: * This method removes all attributes (whitespace, alphabetic, numeric, 600: * quote, and comment) from all characters. It is equivalent to calling 601: * <code>ordinaryChars(0x00, 0xFF)</code>. 602: * 603: * @see #ordinaryChars(int, int) 604: */ 605: public void resetSyntax() 606: { 607: ordinaryChars(0x00, 0xFF); 608: } 609: 610: /** 611: * This method sets a flag that indicates whether or not "C++" language style 612: * comments ("//" comments through EOL ) are handled by the parser. 613: * If this is <code>true</code> commented out sequences are skipped and 614: * ignored by the parser. This defaults to <code>false</code>. 615: * 616: * @param flag <code>true</code> to recognized and handle "C++" style 617: * comments, <code>false</code> otherwise 618: */ 619: public void slashSlashComments(boolean flag) 620: { 621: slashSlash = flag; 622: } 623: 624: /** 625: * This method sets a flag that indicates whether or not "C" language style 626: * comments (with nesting not allowed) are handled by the parser. 627: * If this is <code>true</code> commented out sequences are skipped and 628: * ignored by the parser. This defaults to <code>false</code>. 629: * 630: * @param flag <code>true</code> to recognized and handle "C" style comments, 631: * <code>false</code> otherwise 632: */ 633: public void slashStarComments(boolean flag) 634: { 635: slashStar = flag; 636: } 637: 638: /** 639: * This method returns the current token value as a <code>String</code> in 640: * the form "Token[x], line n", where 'n' is the current line numbers and 641: * 'x' is determined as follows. 642: * <p> 643: * <ul> 644: * <li>If no token has been read, then 'x' is "NOTHING" and 'n' is 0</li> 645: * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"</li> 646: * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"</li> 647: * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code></li> 648: * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where 649: * 'strnval' is <code>String.valueOf(nval)</code>.</li> 650: * <li>If <code>ttype</code> is a quote character, then 'x' is 651: * <code>sval</code></li> 652: * <li>For all other cases, 'x' is <code>ttype</code></li> 653: * </ul> 654: */ 655: public String toString() 656: { 657: String tempstr; 658: if (ttype == TT_EOF) 659: tempstr = "EOF"; 660: else if (ttype == TT_EOL) 661: tempstr = "EOL"; 662: else if (ttype == TT_WORD) 663: tempstr = sval; 664: else if (ttype == TT_NUMBER) 665: tempstr = "n=" + nval; 666: else if (ttype == TT_NONE) 667: tempstr = "NOTHING"; 668: else // must be an ordinary char. 669: tempstr = "\'" + (char) ttype + "\'"; 670: 671: return "Token[" + tempstr + "], line " + lineno(); 672: } 673: 674: /** 675: * This method sets the whitespace attribute for all characters in the 676: * specified range, range terminators included. 677: * 678: * @param low The low end of the range of values to set the whitespace 679: * attribute for 680: * @param hi The high end of the range of values to set the whitespace 681: * attribute for 682: */ 683: public void whitespaceChars(int low, int hi) 684: { 685: if (low < 0) 686: low = 0; 687: if (hi > 255) 688: hi = 255; 689: for (int i = low; i <= hi; i++) 690: { 691: resetChar(i); 692: whitespace[i] = true; 693: } 694: } 695: 696: /** 697: * This method sets the alphabetic attribute for all characters in the 698: * specified range, range terminators included. 699: * 700: * @param low The low end of the range of values to set the alphabetic 701: * attribute for 702: * @param hi The high end of the range of values to set the alphabetic 703: * attribute for 704: */ 705: public void wordChars(int low, int hi) 706: { 707: if (low < 0) 708: low = 0; 709: if (hi > 255) 710: hi = 255; 711: for (int i = low; i <= hi; i++) 712: alphabetic[i] = true; 713: } 714: }