001// XmlParser.java: the main parser class. 002// NO WARRANTY! See README, and copyright below. 003// $Id$ 004package com.microstar.xml; 005 006import java.io.BufferedInputStream; 007import java.io.EOFException; 008import java.io.IOException; 009import java.io.InputStream; 010import java.io.Reader; 011import java.net.URL; 012import java.net.URLConnection; 013import java.util.Enumeration; 014import java.util.Hashtable; 015import java.util.Locale; 016import java.util.Stack; 017 018/** 019 * Parse XML documents and return parse events through call-backs. 020 * <p>You need to define a class implementing the <code>XmlHandler</code> 021 * interface: an object belonging to this class will receive the 022 * callbacks for the events. (As an alternative to implementing 023 * the full XmlHandler interface, you can simply extend the 024 * <code>HandlerBase</code> convenience class.) 025 * <p>Usage (assuming that <code>MyHandler</code> is your implementation 026 * of the <code>XmlHandler</code> interface): 027 * <pre> 028 * XmlHandler handler = new MyHandler(); 029 * XmlParser parser = new XmlParser(); 030 * parser.setHandler(handler); 031 * try { 032 * parser.parse("http://www.host.com/doc.xml", null); 033 * } catch (Exception e) { 034 * [do something interesting] 035 * } 036 * </pre> 037 * <p>Alternatively, you can use the standard SAX interfaces 038 * with the <code>SAXDriver</code> class as your entry point. 039 * @author Copyright (c) 1997, 1998 by Microstar Software Ltd. 040 * @author Written by David Megginson <dmeggins@microstar.com> 041 * @version 1.1 042 * @since Ptolemy II 0.2 043 * @see XmlHandler 044 * @see HandlerBase 045 */ 046public class XmlParser { 047 // 048 // Use special cheats that speed up the code (currently about 50%), 049 // but may cause problems with future maintenance and add to the 050 // class file size (about 500 bytes). 
051 // 052 private final static boolean USE_CHEATS = true; 053 054 ////////////////////////////////////////////////////////////////////// 055 // Constructors. 056 //////////////////////////////////////////////////////////////////////// 057 058 /** 059 * Construct a new parser with no associated handler. 060 * @see #setHandler 061 * @see #parse 062 */ 063 public XmlParser() { 064 } 065 066 /** 067 * Set the handler that will receive parsing events. 068 * @param handler The handler to receive callback events. 069 * @see #parse 070 * @see XmlHandler 071 */ 072 public void setHandler(XmlHandler handler) { 073 this.handler = handler; 074 } 075 076 /** 077 * Parse an XML document from a URI. 078 * <p>You may parse a document more than once, but only one thread 079 * may call this method for an object at one time. 080 * @param systemId The URI of the document. 081 * @param publicId The public identifier of the document, or null. 082 * @param encoding The suggested encoding, or null if unknown. 083 * @exception java.lang.Exception Any exception thrown by your 084 * own handlers, or any derivation of java.io.IOException 085 * thrown by the parser itself. 086 */ 087 public void parse(String systemId, String publicId, String encoding) 088 throws java.lang.Exception { 089 doParse(systemId, publicId, null, null, encoding); 090 } 091 092 /** 093 * Parse an XML document from a byte stream. 094 * <p>The URI that you supply will become the base URI for 095 * resolving relative links, but Ælfred will actually read 096 * the document from the supplied input stream. 097 * <p>You may parse a document more than once, but only one thread 098 * may call this method for an object at one time. 099 * @param systemId The base URI of the document, or null if not 100 * known. 101 * @param publicId The public identifier of the document, or null 102 * if not known. 103 * @param stream A byte input stream. 104 * @param encoding The suggested encoding, or null if unknown. 
105 * @exception java.lang.Exception Any exception thrown by your 106 * own handlers, or any derivation of java.io.IOException 107 * thrown by the parser itself. 108 */ 109 public void parse(String systemId, String publicId, InputStream stream, 110 String encoding) throws java.lang.Exception { 111 doParse(systemId, publicId, null, stream, encoding); 112 } 113 114 /** 115 * Parse an XML document from a character stream. 116 * <p>The URI that you supply will become the base URI for 117 * resolving relative links, but Ælfred will actually read 118 * the document from the supplied input stream. 119 * <p>You may parse a document more than once, but only one thread 120 * may call this method for an object at one time. 121 * @param systemId The base URI of the document, or null if not 122 * known. 123 * @param publicId The public identifier of the document, or null 124 * if not known. 125 * @param reader A character stream. 126 * @exception java.lang.Exception Any exception thrown by your 127 * own handlers, or any derivation of java.io.IOException 128 * thrown by the parser itself. 129 */ 130 public void parse(String systemId, String publicId, Reader reader) 131 throws java.lang.Exception { 132 doParse(systemId, publicId, reader, null, null); 133 } 134 135 private synchronized void doParse(String systemId, String publicId, 136 Reader reader, InputStream stream, String encoding) 137 throws java.lang.Exception { 138 basePublicId = publicId; 139 baseURI = systemId; 140 baseReader = reader; 141 baseInputStream = stream; 142 143 initializeVariables(); 144 145 // Set the default entities here. 
146 setInternalEntity(intern("amp"), "&"); 147 setInternalEntity(intern("lt"), "<"); 148 setInternalEntity(intern("gt"), ">"); 149 setInternalEntity(intern("apos"), "'"); 150 setInternalEntity(intern("quot"), """); 151 152 if (handler != null) { 153 handler.startDocument(); 154 } 155 156 pushURL("[document]", basePublicId, baseURI, baseReader, 157 baseInputStream, encoding); 158 159 parseDocument(); 160 161 if (handler != null) { 162 handler.endDocument(); 163 } 164 165 cleanupVariables(); 166 } 167 168 //////////////////////////////////////////////////////////////////////// 169 // Constants. 170 //////////////////////////////////////////////////////////////////////// 171 // 172 // Constants for element content type. 173 // 174 175 /** 176 * Constant: an element has not been declared. 177 * @see #getElementContentType 178 */ 179 public final static int CONTENT_UNDECLARED = 0; 180 181 /** 182 * Constant: the element has a content model of ANY. 183 * @see #getElementContentType 184 */ 185 public final static int CONTENT_ANY = 1; 186 187 /** 188 * Constant: the element has declared content of EMPTY. 189 * @see #getElementContentType 190 */ 191 public final static int CONTENT_EMPTY = 2; 192 193 /** 194 * Constant: the element has mixed content. 195 * @see #getElementContentType 196 */ 197 public final static int CONTENT_MIXED = 3; 198 199 /** 200 * Constant: the element has element content. 201 * @see #getElementContentType 202 */ 203 public final static int CONTENT_ELEMENTS = 4; 204 205 // 206 // Constants for the entity type. 207 // 208 209 /** 210 * Constant: the entity has not been declared. 211 * @see #getEntityType 212 */ 213 public final static int ENTITY_UNDECLARED = 0; 214 215 /** 216 * Constant: the entity is internal. 217 * @see #getEntityType 218 */ 219 public final static int ENTITY_INTERNAL = 1; 220 221 /** 222 * Constant: the entity is external, non-XML data. 
223 * @see #getEntityType 224 */ 225 public final static int ENTITY_NDATA = 2; 226 227 /** 228 * Constant: the entity is external XML data. 229 * @see #getEntityType 230 */ 231 public final static int ENTITY_TEXT = 3; 232 233 // 234 // Constants for attribute type. 235 // 236 237 /** 238 * Constant: the attribute has not been declared for this element type. 239 * @see #getAttributeType 240 */ 241 public final static int ATTRIBUTE_UNDECLARED = 0; 242 243 /** 244 * Constant: the attribute value is a string value. 245 * @see #getAttributeType 246 */ 247 public final static int ATTRIBUTE_CDATA = 1; 248 249 /** 250 * Constant: the attribute value is a unique identifier. 251 * @see #getAttributeType 252 */ 253 public final static int ATTRIBUTE_ID = 2; 254 255 /** 256 * Constant: the attribute value is a reference to a unique identifier. 257 * @see #getAttributeType 258 */ 259 public final static int ATTRIBUTE_IDREF = 3; 260 261 /** 262 * Constant: the attribute value is a list of ID references. 263 * @see #getAttributeType 264 */ 265 public final static int ATTRIBUTE_IDREFS = 4; 266 267 /** 268 * Constant: the attribute value is the name of an entity. 269 * @see #getAttributeType 270 */ 271 public final static int ATTRIBUTE_ENTITY = 5; 272 273 /** 274 * Constant: the attribute value is a list of entity names. 275 * @see #getAttributeType 276 */ 277 public final static int ATTRIBUTE_ENTITIES = 6; 278 279 /** 280 * Constant: the attribute value is a name token. 281 * @see #getAttributeType 282 */ 283 public final static int ATTRIBUTE_NMTOKEN = 7; 284 285 /** 286 * Constant: the attribute value is a list of name tokens. 287 * @see #getAttributeType 288 */ 289 public final static int ATTRIBUTE_NMTOKENS = 8; 290 291 /** 292 * Constant: the attribute value is a token from an enumeration. 293 * @see #getAttributeType 294 */ 295 public final static int ATTRIBUTE_ENUMERATED = 9; 296 297 /** 298 * Constant: the attribute is the name of a notation. 
299 * @see #getAttributeType 300 */ 301 public final static int ATTRIBUTE_NOTATION = 10; 302 303 // 304 // When the class is loaded, populate the hash table of 305 // attribute types. 306 // 307 308 /** 309 * Hash table of attribute types. 310 */ 311 private static Hashtable attributeTypeHash; 312 313 static { 314 attributeTypeHash = new Hashtable(); 315 attributeTypeHash.put("CDATA", Integer.valueOf(ATTRIBUTE_CDATA)); 316 attributeTypeHash.put("ID", Integer.valueOf(ATTRIBUTE_ID)); 317 attributeTypeHash.put("IDREF", Integer.valueOf(ATTRIBUTE_IDREF)); 318 attributeTypeHash.put("IDREFS", Integer.valueOf(ATTRIBUTE_IDREFS)); 319 attributeTypeHash.put("ENTITY", Integer.valueOf(ATTRIBUTE_ENTITY)); 320 attributeTypeHash.put("ENTITIES", Integer.valueOf(ATTRIBUTE_ENTITIES)); 321 attributeTypeHash.put("NMTOKEN", Integer.valueOf(ATTRIBUTE_NMTOKEN)); 322 attributeTypeHash.put("NMTOKENS", Integer.valueOf(ATTRIBUTE_NMTOKENS)); 323 attributeTypeHash.put("NOTATION", Integer.valueOf(ATTRIBUTE_NOTATION)); 324 } 325 326 // 327 // Constants for supported encodings. 328 // 329 private final static int ENCODING_UTF_8 = 1; 330 331 private final static int ENCODING_ISO_8859_1 = 2; 332 333 private final static int ENCODING_UCS_2_12 = 3; 334 335 private final static int ENCODING_UCS_2_21 = 4; 336 337 private final static int ENCODING_UCS_4_1234 = 5; 338 339 private final static int ENCODING_UCS_4_4321 = 6; 340 341 private final static int ENCODING_UCS_4_2143 = 7; 342 343 private final static int ENCODING_UCS_4_3412 = 8; 344 345 // 346 // Constants for attribute default value. 347 // 348 349 /** 350 * Constant: the attribute is not declared. 351 * @see #getAttributeDefaultValueType 352 */ 353 public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0; 354 355 /** 356 * Constant: the attribute has a literal default value specified. 
357 * @see #getAttributeDefaultValueType 358 * @see #getAttributeDefaultValue 359 */ 360 public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1; 361 362 /** 363 * Constant: the attribute was declared #IMPLIED. 364 * @see #getAttributeDefaultValueType 365 */ 366 public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2; 367 368 /** 369 * Constant: the attribute was declared #REQUIRED. 370 * @see #getAttributeDefaultValueType 371 */ 372 public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3; 373 374 /** 375 * Constant: the attribute was declared #FIXED. 376 * @see #getAttributeDefaultValueType 377 * @see #getAttributeDefaultValue 378 */ 379 public final static int ATTRIBUTE_DEFAULT_FIXED = 4; 380 381 // 382 // Constants for input. 383 // 384 private final static int INPUT_NONE = 0; 385 386 private final static int INPUT_INTERNAL = 1; 387 388 private final static int INPUT_EXTERNAL = 2; 389 390 private final static int INPUT_STREAM = 3; 391 392 private final static int INPUT_BUFFER = 4; 393 394 private final static int INPUT_READER = 5; 395 396 // 397 // Flags for reading literals. 398 // 399 private final static int LIT_CHAR_REF = 1; 400 401 private final static int LIT_ENTITY_REF = 2; 402 403 private final static int LIT_PE_REF = 4; 404 405 private final static int LIT_NORMALIZE = 8; 406 407 // 408 // Flags for parsing context. 409 // 410 private final static int CONTEXT_NONE = 0; 411 412 private final static int CONTEXT_DTD = 1; 413 414 private final static int CONTEXT_ENTITYVALUE = 2; 415 416 private final static int CONTEXT_ATTRIBUTEVALUE = 3; 417 418 ////////////////////////////////////////////////////////////////////// 419 // Error reporting. 420 ////////////////////////////////////////////////////////////////////// 421 422 /** 423 * Report an error. 424 * @param message The error message. 425 * @param textFound The text that caused the error (or null). 
426 * @see XmlHandler#error 427 * @see #line 428 */ 429 void error(String message, String textFound, String textExpected) 430 throws java.lang.Exception { 431 //errorCount++; 432 433 if (textFound != null) { 434 message = message + " (found \"" + textFound + "\")"; 435 } 436 437 if (textExpected != null) { 438 message = message + " (expected \"" + textExpected + "\")"; 439 } 440 441 if (handler != null) { 442 String uri = null; 443 444 if (externalEntity != null) { 445 uri = externalEntity.getURL().toString(); 446 } 447 448 handler.error(message, uri, line, column); 449 } 450 } 451 452 /** 453 * Report a serious error. 454 * @param message The error message. 455 * @param textFound The text that caused the error (or null). 456 */ 457 void error(String message, char textFound, String textExpected) 458 throws java.lang.Exception { 459 error(message, Character.toString(textFound), textExpected); 460 } 461 462 ////////////////////////////////////////////////////////////////////// 463 // Major syntactic productions. 464 ////////////////////////////////////////////////////////////////////// 465 466 /** 467 * Parse an XML document. 468 * <pre> 469 * [1] document ::= prolog element Misc* 470 * </pre> 471 * <p>This is the top-level parsing function for a single XML 472 * document. As a minimum, a well-formed document must have 473 * a document element, and a valid document must have a prolog 474 * as well. 475 */ 476 void parseDocument() throws java.lang.Exception { 477 char c; 478 479 parseProlog(); 480 require('<'); 481 parseElement(); 482 483 try { 484 parseMisc(); //skip all white, PIs, and comments 485 c = readCh(); //if this doesn't throw an exception... 486 error("unexpected characters after document end", c, null); 487 } catch (EOFException e) { 488 return; 489 } 490 } 491 492 /** 493 * Skip a comment. 494 * <pre> 495 * [18] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->" 496 * </pre> 497 * <p>(The <code><!--</code> has already been read.) 
498 */ 499 void parseComment() throws java.lang.Exception { 500 skipUntil("-->"); 501 } 502 503 /** 504 * Parse a processing instruction and do a call-back. 505 * <pre> 506 * [19] PI ::= '<?' Name (S (Char* - (Char* '?>' Char*)))? '?>' 507 * </pre> 508 * <p>(The <code><?</code> has already been read.) 509 * <p>An XML processing instruction <em>must</em> begin with 510 * a Name, which is the instruction's target. 511 */ 512 void parsePI() throws java.lang.Exception { 513 String name; 514 515 name = readNmtoken(true); 516 517 if (!tryRead("?>")) { 518 requireWhitespace(); 519 parseUntil("?>"); 520 } 521 522 if (handler != null) { 523 handler.processingInstruction(name, dataBufferToString()); 524 } 525 } 526 527 /** 528 * Parse a CDATA marked section. 529 * <pre> 530 * [20] CDSect ::= CDStart CData CDEnd 531 * [21] CDStart ::= '<![CDATA[' 532 * [22] CData ::= (Char* - (Char* ']]>' Char*)) 533 * [23] CDEnd ::= ']]>' 534 * </pre> 535 * <p>(The '<![CDATA[' has already been read.) 536 * <p>Note that this just appends characters to the dataBuffer, 537 * without actually generating an event. 538 */ 539 void parseCDSect() throws java.lang.Exception { 540 parseUntil("]]>"); 541 } 542 543 /** 544 * Parse the prolog of an XML document. 545 * <pre> 546 * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)? 547 * </pre> 548 * <p>There are a couple of tricks here. First, it is necessary to 549 * declare the XML default attributes after the DTD (if present) 550 * has been read. Second, it is not possible to expand general 551 * references in attribute value literals until after the entire 552 * DTD (if present) has been parsed. 553 * <p>We do not look for the XML declaration here, because it is 554 * handled by pushURL(). 555 * @see #pushURL 556 */ 557 void parseProlog() throws java.lang.Exception { 558 parseMisc(); 559 560 if (tryRead("<!DOCTYPE")) { 561 parseDoctypedecl(); 562 parseMisc(); 563 } 564 } 565 566 /** 567 * Parse the XML declaration. 
568 * <pre> 569 * [25] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' 570 * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'") 571 * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'" 572 * | S 'standalone' Eq '"' ("yes" | "no") '"' 573 * [78] EncodingDecl ::= S 'encoding' Eq QEncoding 574 * </pre> 575 * <p>([80] to [82] are also significant.) 576 * <p>(The <code><?xml</code> and whitespace have already been read.) 577 * <p>TODO: validate value of standalone. 578 * @see #parseTextDecl 579 * @see #checkEncoding 580 */ 581 void parseXMLDecl(boolean ignoreEncoding) throws java.lang.Exception { 582 String version; 583 String encodingName = null; 584 585 // String standalone = null; 586 // Read the version. 587 require("version"); 588 parseEq(); 589 version = readLiteral(0); 590 591 if (!version.equals("1.0")) { 592 error("unsupported XML version", version, "1.0"); 593 } 594 595 // Try reading an encoding declaration. 596 skipWhitespace(); 597 598 if (tryRead("encoding")) { 599 parseEq(); 600 encodingName = readLiteral(0); 601 checkEncoding(encodingName, ignoreEncoding); 602 } 603 604 // Try reading a standalone declaration 605 skipWhitespace(); 606 607 if (tryRead("standalone")) { 608 parseEq(); 609 610 // FIXME: Why is the literal read, but the value ignored? 611 /* standalone = */readLiteral(0); 612 } 613 614 skipWhitespace(); 615 require("?>"); 616 } 617 618 /** 619 * Parse the Encoding PI. 620 * <pre> 621 * [78] EncodingDecl ::= S 'encoding' Eq QEncoding 622 * [79] EncodingPI ::= '<?xml' S 'encoding' Eq QEncoding S? '?>' 623 * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'" 624 * [81] Encoding ::= LatinName 625 * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* 626 * </pre> 627 * <p>(The <code><?xml</code>' and whitespace have already been read.) 
628 * @see #parseXMLDecl 629 * @see #checkEncoding 630 */ 631 void parseTextDecl(boolean ignoreEncoding) throws java.lang.Exception { 632 String encodingName = null; 633 634 // Read an optional version. 635 if (tryRead("version")) { 636 String version; 637 parseEq(); 638 version = readLiteral(0); 639 640 if (!version.equals("1.0")) { 641 error("unsupported XML version", version, "1.0"); 642 } 643 644 requireWhitespace(); 645 } 646 647 // Read the encoding. 648 require("encoding"); 649 parseEq(); 650 encodingName = readLiteral(0); 651 checkEncoding(encodingName, ignoreEncoding); 652 653 skipWhitespace(); 654 require("?>"); 655 } 656 657 /** 658 * Check that the encoding specified makes sense. 659 * <p>Compare what the author has specified in the XML declaration 660 * or encoding PI with what we have detected. 661 * <p>This is also important for distinguishing among the various 662 * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect 663 * those). 664 * @param encodingName The name of the encoding specified by the user. 
665 * @see #parseXMLDecl 666 * @see #parseTextDecl 667 */ 668 void checkEncoding(String encodingName, boolean ignoreEncoding) 669 throws java.lang.Exception { 670 // FindBugs suggests using toUpperCase(Locale) 671 encodingName = encodingName.toUpperCase(Locale.getDefault()); 672 673 if (ignoreEncoding) { 674 return; 675 } 676 677 switch (encoding) { 678 // 8-bit encodings 679 case ENCODING_UTF_8: 680 681 if (encodingName.equals("ISO-8859-1")) { 682 encoding = ENCODING_ISO_8859_1; 683 } else if (!encodingName.equals("UTF-8")) { 684 error("unsupported 8-bit encoding", encodingName, 685 "UTF-8 or ISO-8859-1"); 686 } 687 688 break; 689 690 // 16-bit encodings 691 case ENCODING_UCS_2_12: 692 case ENCODING_UCS_2_21: 693 694 if (!encodingName.equals("ISO-10646-UCS-2") 695 && !encodingName.equals("UTF-16")) { 696 error("unsupported 16-bit encoding", encodingName, 697 "ISO-10646-UCS-2"); 698 } 699 700 break; 701 702 // 32-bit encodings 703 case ENCODING_UCS_4_1234: 704 case ENCODING_UCS_4_4321: 705 case ENCODING_UCS_4_2143: 706 case ENCODING_UCS_4_3412: 707 708 if (!encodingName.equals("ISO-10646-UCS-4")) { 709 error("unsupported 32-bit encoding", encodingName, 710 "ISO-10646-UCS-4"); 711 } 712 } 713 } 714 715 /** 716 * Parse miscellaneous markup outside the document element and DOCTYPE 717 * declaration. 718 * <pre> 719 * [27] Misc ::= Comment | PI | S 720 * </pre> 721 */ 722 void parseMisc() throws java.lang.Exception { 723 while (true) { 724 skipWhitespace(); 725 726 if (tryRead("<?")) { 727 parsePI(); 728 } else if (tryRead("<!--")) { 729 parseComment(); 730 } else { 731 return; 732 } 733 } 734 } 735 736 /** 737 * Parse a document type declaration. 738 * <pre> 739 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 740 * ('[' %markupdecl* ']' S?)? '>' 741 * </pre> 742 * <p>(The <code><!DOCTYPE</code> has already been read.) 
743 */ 744 void parseDoctypedecl() throws java.lang.Exception { 745 String doctypeName; 746 String[] ids; 747 748 // Read the document type name. 749 requireWhitespace(); 750 doctypeName = readNmtoken(true); 751 752 // Read the ExternalIDs. 753 skipWhitespace(); 754 ids = readExternalIds(false); 755 756 // Look for a declaration subset. 757 skipWhitespace(); 758 759 if (tryRead('[')) { 760 // loop until the subset ends 761 while (true) { 762 context = CONTEXT_DTD; 763 skipWhitespace(); 764 context = CONTEXT_NONE; 765 766 if (tryRead(']')) { 767 break; // end of subset 768 } else { 769 context = CONTEXT_DTD; 770 parseMarkupdecl(); 771 context = CONTEXT_NONE; 772 } 773 } 774 } 775 776 // Read the external subset, if any 777 if (ids[1] != null) { 778 pushURL("[external subset]", ids[0], ids[1], null, null, null); 779 780 // Loop until we end up back at '>' 781 while (true) { 782 context = CONTEXT_DTD; 783 skipWhitespace(); 784 context = CONTEXT_NONE; 785 786 if (tryRead('>')) { 787 break; 788 } else { 789 context = CONTEXT_DTD; 790 parseMarkupdecl(); 791 context = CONTEXT_NONE; 792 } 793 } 794 } else { 795 // No external subset. 796 skipWhitespace(); 797 require('>'); 798 } 799 800 if (handler != null) { 801 handler.doctypeDecl(doctypeName, ids[0], ids[1]); 802 } 803 804 // Expand general entities in 805 // default values of attributes. 806 // (Do this after the doctypeDecl 807 // event!). 808 // expandAttributeDefaultValues(); 809 } 810 811 /** 812 * Parse a markup declaration in the internal or external DTD subset. 
813 * <pre> 814 * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl | 815 * %NotationDecl | %PI | %S | %Comment | 816 * InternalPERef ) 817 * [30] InternalPERef ::= PEReference 818 * [31] extSubset ::= (%markupdecl | %conditionalSect)* 819 * </pre> 820 */ 821 void parseMarkupdecl() throws java.lang.Exception { 822 if (tryRead("<!ELEMENT")) { 823 parseElementdecl(); 824 } else if (tryRead("<!ATTLIST")) { 825 parseAttlistDecl(); 826 } else if (tryRead("<!ENTITY")) { 827 parseEntityDecl(); 828 } else if (tryRead("<!NOTATION")) { 829 parseNotationDecl(); 830 } else if (tryRead("<?")) { 831 parsePI(); 832 } else if (tryRead("<!--")) { 833 parseComment(); 834 } else if (tryRead("<![")) { 835 parseConditionalSect(); 836 } else { 837 error("expected markup declaration", null, null); 838 } 839 } 840 841 /** 842 * Parse an element, with its tags. 843 * <pre> 844 * [33] STag ::= '<' Name (S Attribute)* S? '>' [WFC: unique Att spec] 845 * [38] element ::= EmptyElement | STag content ETag 846 * [39] EmptyElement ::= '<' Name (S Attribute)* S? '/>' 847 * [WFC: unique Att spec] 848 * </pre> 849 * <p>(The '<' has already been read.) 850 * <p>NOTE: this method actually chains onto parseContent(), if necessary, 851 * and parseContent() will take care of calling parseETag(). 852 */ 853 void parseElement() throws java.lang.Exception { 854 String gi; 855 char c; 856 int oldElementContent = currentElementContent; 857 String oldElement = currentElement; 858 859 // This is the (global) counter for the 860 // array of specified attributes. 861 tagAttributePos = 0; 862 863 // Read the element type name. 864 gi = readNmtoken(true); 865 866 // Determine the current content type. 867 currentElement = gi; 868 currentElementContent = getElementContentType(gi); 869 870 if (currentElementContent == CONTENT_UNDECLARED) { 871 currentElementContent = CONTENT_ANY; 872 } 873 874 // Read the attributes, if any. 
875 // After this loop, we should be just 876 // in front of the closing delimiter. 877 skipWhitespace(); 878 c = readCh(); 879 880 while (c != '/' && c != '>') { 881 unread(c); 882 parseAttribute(gi); 883 skipWhitespace(); 884 c = readCh(); 885 } 886 887 unread(c); 888 889 // Supply any defaulted attributes. 890 Enumeration atts = declaredAttributes(gi); 891 892 if (atts != null) { 893 String aname; 894 loop: while (atts.hasMoreElements()) { 895 aname = (String) atts.nextElement(); 896 897 // See if it was specified. 898 for (int i = 0; i < tagAttributePos; i++) { 899 if (tagAttributes[i].equals(aname)) { 900 continue loop; 901 } 902 } 903 904 // I guess not... 905 if (handler != null) { 906 handler.attribute(aname, 907 getAttributeExpandedValue(gi, aname), false); 908 } 909 } 910 } 911 912 // Figure out if this is a start tag 913 // or an empty element, and dispatch an 914 // event accordingly. 915 c = readCh(); 916 917 switch (c) { 918 case '>': 919 920 if (handler != null) { 921 handler.startElement(gi); 922 } 923 924 parseContent(); 925 break; 926 927 case '/': 928 require('>'); 929 930 if (handler != null) { 931 handler.startElement(gi); 932 handler.endElement(gi); 933 } 934 935 break; 936 } 937 938 // Restore the previous state. 939 currentElement = oldElement; 940 currentElementContent = oldElementContent; 941 } 942 943 /** 944 * Parse an attribute assignment. 945 * <pre> 946 * [34] Attribute ::= Name Eq AttValue 947 * </pre> 948 * @param name The name of the attribute's element. 949 * @see XmlHandler#attribute 950 */ 951 void parseAttribute(String name) throws java.lang.Exception { 952 String aname; 953 int type; 954 String value; 955 956 // Read the attribute name. 957 aname = readNmtoken(true).intern(); 958 959 // Fix by Zoltan Kemenczy for: 960 // "attribute value normalization according to Section 3.3.3 961 // Attribute-Value Normalization of XML 1.0 962 // http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize). 
It 963 // says that escaped whitespace character references that are not 964 // #x20 (like the newline,#xa) should be preserved in the 965 // normalized value)" 966 //type = getAttributeDefaultValueType(name, aname); 967 type = getAttributeType(name, aname); 968 969 // Parse '=' 970 parseEq(); 971 972 // Read the value, normalizing whitespace 973 // if it is not CDATA. 974 if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) { 975 value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF); 976 } else { 977 value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF | LIT_NORMALIZE); 978 } 979 980 // Inform the handler about the 981 // attribute. 982 if (handler != null) { 983 handler.attribute(aname, value, true); 984 } 985 986 dataBufferPos = 0; 987 988 // Note that the attribute has been 989 // specified. 990 if (tagAttributePos == tagAttributes.length) { 991 String[] newAttrib = new String[tagAttributes.length * 2]; 992 System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos); 993 tagAttributes = newAttrib; 994 } 995 996 tagAttributes[tagAttributePos++] = aname; 997 } 998 999 /** 1000 * Parse an equals sign surrounded by optional whitespace. 1001 * [35] Eq ::= S? '=' S? 1002 */ 1003 void parseEq() throws java.lang.Exception { 1004 skipWhitespace(); 1005 require('='); 1006 skipWhitespace(); 1007 } 1008 1009 /** 1010 * Parse an end tag. 1011 * [36] ETag ::= '</' Name S? '>' 1012 * *NOTE: parseContent() chains to here. 1013 */ 1014 void parseETag() throws java.lang.Exception { 1015 String name; 1016 name = readNmtoken(true); 1017 1018 if (!name.equals(currentElement)) { 1019 error("mismatched end tag", name, currentElement); 1020 } 1021 1022 skipWhitespace(); 1023 require('>'); 1024 1025 if (handler != null) { 1026 handler.endElement(name); 1027 } 1028 } 1029 1030 /** 1031 * Parse the content of an element. 
1032 * [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)* 1033 * [68] Reference ::= EntityRef | CharRef 1034 */ 1035 void parseContent() throws java.lang.Exception { 1036 char c; 1037 1038 while (true) { 1039 switch (currentElementContent) { 1040 case CONTENT_ANY: 1041 case CONTENT_MIXED: 1042 parsePCData(); 1043 break; 1044 1045 case CONTENT_ELEMENTS: 1046 parseWhitespace(); 1047 break; 1048 } 1049 1050 // Handle delimiters 1051 c = readCh(); 1052 1053 switch (c) { 1054 case '&': // Found "&" 1055 c = readCh(); 1056 1057 if (c == '#') { 1058 parseCharRef(); 1059 } else { 1060 unread(c); 1061 parseEntityRef(true); 1062 } 1063 1064 break; 1065 1066 case '<': // Found "<" 1067 c = readCh(); 1068 1069 switch (c) { 1070 case '!': // Found "<!" 1071 c = readCh(); 1072 1073 switch (c) { 1074 case '-': // Found "<!-" 1075 require('-'); 1076 parseComment(); 1077 break; 1078 1079 case '[': // Found "<![" 1080 require("CDATA["); 1081 parseCDSect(); 1082 break; 1083 1084 default: 1085 error("expected comment or CDATA section", c, null); 1086 break; 1087 } 1088 1089 break; 1090 1091 case '?': // Found "<?" 1092 dataBufferFlush(); 1093 parsePI(); 1094 break; 1095 1096 case '/': // Found "</" 1097 dataBufferFlush(); 1098 parseETag(); 1099 return; 1100 1101 default: // Found "<" followed by something else 1102 dataBufferFlush(); 1103 unread(c); 1104 parseElement(); 1105 break; 1106 } 1107 } 1108 } 1109 } 1110 1111 /** 1112 * Parse an element type declaration. 1113 * [40] elementdecl ::= '<!ELEMENT' S %Name S (%S S)? %contentspec S? '>' 1114 * [VC: Unique Element Declaration] 1115 * *NOTE: the '<!ELEMENT' has already been read. 1116 */ 1117 void parseElementdecl() throws java.lang.Exception { 1118 String name; 1119 1120 requireWhitespace(); 1121 1122 // Read the element type name. 1123 name = readNmtoken(true); 1124 1125 requireWhitespace(); 1126 1127 // Read the content model. 
1128 parseContentspec(name); 1129 1130 skipWhitespace(); 1131 require('>'); 1132 } 1133 1134 /** 1135 * Content specification. 1136 * [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements 1137 */ 1138 void parseContentspec(String name) throws java.lang.Exception { 1139 if (tryRead("EMPTY")) { 1140 setElement(name, CONTENT_EMPTY, null, null); 1141 return; 1142 } else if (tryRead("ANY")) { 1143 setElement(name, CONTENT_ANY, null, null); 1144 return; 1145 } else { 1146 require('('); 1147 dataBufferAppend('('); 1148 skipWhitespace(); 1149 1150 if (tryRead("#PCDATA")) { 1151 dataBufferAppend("#PCDATA"); 1152 parseMixed(); 1153 setElement(name, CONTENT_MIXED, dataBufferToString(), null); 1154 } else { 1155 parseElements(); 1156 setElement(name, CONTENT_ELEMENTS, dataBufferToString(), null); 1157 } 1158 } 1159 } 1160 1161 /** 1162 * Parse an element-content model. 1163 * [42] elements ::= (choice | seq) ('?' | '*' | '+')? 1164 * [44] cps ::= S? %cp S? 1165 * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')' 1166 * [46] ctokplus ::= cps ('|' cps)+ 1167 * [47] ctoks ::= cps ('|' cps)* 1168 * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')' 1169 * [49] stoks ::= cps (',' cps)* 1170 * *NOTE: the opening '(' and S have already been read. 1171 * *TODO: go over parameter entity boundaries more carefully. 1172 */ 1173 void parseElements() throws java.lang.Exception { 1174 char c; 1175 char sep; 1176 1177 // Parse the first content particle 1178 skipWhitespace(); 1179 parseCp(); 1180 1181 // Check for end or for a separator. 1182 skipWhitespace(); 1183 c = readCh(); 1184 1185 switch (c) { 1186 case ')': 1187 dataBufferAppend(')'); 1188 c = readCh(); 1189 1190 switch (c) { 1191 case '*': 1192 case '+': 1193 case '?': 1194 dataBufferAppend(c); 1195 break; 1196 1197 default: 1198 unread(c); 1199 } 1200 1201 return; 1202 1203 case ',': // Register the separator. 
1204 case '|': 1205 sep = c; 1206 dataBufferAppend(c); 1207 break; 1208 1209 default: 1210 error("bad separator in content model", c, null); 1211 return; 1212 } 1213 1214 // Parse the rest of the content model. 1215 while (true) { 1216 skipWhitespace(); 1217 parseCp(); 1218 skipWhitespace(); 1219 c = readCh(); 1220 1221 if (c == ')') { 1222 dataBufferAppend(')'); 1223 break; 1224 } else if (c != sep) { 1225 error("bad separator in content model", c, "'" + sep + "'"); 1226 return; 1227 } else { 1228 dataBufferAppend(c); 1229 } 1230 } 1231 1232 // Check for the occurrence indicator. 1233 c = readCh(); 1234 1235 switch (c) { 1236 case '?': 1237 case '*': 1238 case '+': 1239 dataBufferAppend(c); 1240 return; 1241 1242 default: 1243 unread(c); 1244 return; 1245 } 1246 } 1247 1248 /** 1249 * Parse a content particle. 1250 * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+') 1251 * *NOTE: I actually use a slightly different production here: 1252 * cp ::= (elements | (Name ('?' | '*' | '+')?)) 1253 */ 1254 void parseCp() throws java.lang.Exception { 1255 char c; 1256 1257 if (tryRead('(')) { 1258 dataBufferAppend('('); 1259 parseElements(); 1260 } else { 1261 dataBufferAppend(readNmtoken(true)); 1262 c = readCh(); 1263 1264 switch (c) { 1265 case '?': 1266 case '*': 1267 case '+': 1268 dataBufferAppend(c); 1269 break; 1270 1271 default: 1272 unread(c); 1273 break; 1274 } 1275 } 1276 } 1277 1278 /** 1279 * Parse mixed content. 1280 * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*' 1281 * | '(' S? %('#PCDATA') S? ')' 1282 * [51] Mtoks ::= %Name (S? '|' S? %Name)* 1283 * *NOTE: the S and '#PCDATA' have already been read. 1284 */ 1285 void parseMixed() throws java.lang.Exception { 1286 // Check for PCDATA alone. 1287 skipWhitespace(); 1288 1289 if (tryRead(')')) { 1290 dataBufferAppend(")*"); 1291 tryRead('*'); 1292 return; 1293 } 1294 1295 // Parse mixed content. 
1296 skipWhitespace(); 1297 1298 while (!tryRead(")*")) { 1299 require('|'); 1300 dataBufferAppend('|'); 1301 skipWhitespace(); 1302 dataBufferAppend(readNmtoken(true)); 1303 skipWhitespace(); 1304 } 1305 1306 dataBufferAppend(")*"); 1307 } 1308 1309 /** 1310 * Parse an attribute list declaration. 1311 * [52] AttlistDecl ::= '<!ATTLIST' S %Name S? %AttDef+ S? '>' 1312 * *NOTE: the '<!ATTLIST' has already been read. 1313 */ 1314 void parseAttlistDecl() throws java.lang.Exception { 1315 String elementName; 1316 1317 requireWhitespace(); 1318 elementName = readNmtoken(true); 1319 requireWhitespace(); 1320 1321 while (!tryRead('>')) { 1322 parseAttDef(elementName); 1323 skipWhitespace(); 1324 } 1325 } 1326 1327 /** 1328 * Parse a single attribute definition. 1329 * [53] AttDef ::= S %Name S %AttType S %Default 1330 */ 1331 void parseAttDef(String elementName) throws java.lang.Exception { 1332 String name; 1333 int type; 1334 String enumeration = null; 1335 1336 // Read the attribute name. 1337 name = readNmtoken(true); 1338 1339 // Read the attribute type. 1340 requireWhitespace(); 1341 type = readAttType(); 1342 1343 // Get the string of enumerated values 1344 // if necessary. 1345 if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) { 1346 enumeration = dataBufferToString(); 1347 } 1348 1349 // Read the default value. 1350 requireWhitespace(); 1351 parseDefault(elementName, name, type, enumeration); 1352 } 1353 1354 /** 1355 * Parse the attribute type. 1356 * [54] AttType ::= StringType | TokenizedType | EnumeratedType 1357 * [55] StringType ::= 'CDATA' 1358 * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' | 1359 * 'NMTOKEN' | 'NMTOKENS' 1360 * [57] EnumeratedType ::= NotationType | Enumeration 1361 * *TODO: validate the type!! 
1362 */ 1363 int readAttType() throws java.lang.Exception { 1364 String typeString; 1365 Integer type; 1366 1367 if (tryRead('(')) { 1368 parseEnumeration(); 1369 return ATTRIBUTE_ENUMERATED; 1370 } else { 1371 typeString = readNmtoken(true); 1372 1373 if (typeString.equals("NOTATION")) { 1374 parseNotationType(); 1375 } 1376 1377 type = (Integer) attributeTypeHash.get(typeString); 1378 1379 if (type == null) { 1380 error("illegal attribute type", typeString, null); 1381 return ATTRIBUTE_UNDECLARED; 1382 } else { 1383 return type.intValue(); 1384 } 1385 } 1386 } 1387 1388 /** 1389 * Parse an enumeration. 1390 * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')' 1391 * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)* 1392 * *NOTE: the '(' has already been read. 1393 */ 1394 void parseEnumeration() throws java.lang.Exception { 1395 dataBufferAppend('('); 1396 1397 // Read the first token. 1398 skipWhitespace(); 1399 dataBufferAppend(readNmtoken(true)); 1400 1401 // Read the remaining tokens. 1402 skipWhitespace(); 1403 1404 while (!tryRead(')')) { 1405 require('|'); 1406 dataBufferAppend('|'); 1407 skipWhitespace(); 1408 dataBufferAppend(readNmtoken(true)); 1409 skipWhitespace(); 1410 } 1411 1412 dataBufferAppend(')'); 1413 } 1414 1415 /** 1416 * Parse a notation type for an attribute. 1417 * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)* 1418 * S? ')' 1419 * [59] Ntoks ::= %Name (S? '|' S? %Name) 1420 * *NOTE: the 'NOTATION' has already been read 1421 */ 1422 void parseNotationType() throws java.lang.Exception { 1423 requireWhitespace(); 1424 require('('); 1425 1426 parseEnumeration(); 1427 } 1428 1429 /** 1430 * Parse the default value for an attribute. 1431 * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? 
%AttValue 1432 */ 1433 void parseDefault(String elementName, String name, int type, 1434 String enumeration) throws java.lang.Exception { 1435 int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; 1436 String value = null; 1437 1438 if (tryRead('#')) { 1439 if (tryRead("FIXED")) { 1440 valueType = ATTRIBUTE_DEFAULT_FIXED; 1441 requireWhitespace(); 1442 context = CONTEXT_ATTRIBUTEVALUE; 1443 value = readLiteral(LIT_CHAR_REF); 1444 context = CONTEXT_DTD; 1445 } else if (tryRead("REQUIRED")) { 1446 valueType = ATTRIBUTE_DEFAULT_REQUIRED; 1447 } else if (tryRead("IMPLIED")) { 1448 valueType = ATTRIBUTE_DEFAULT_IMPLIED; 1449 } else { 1450 error("illegal keyword for attribute default value", null, 1451 null); 1452 } 1453 } else { 1454 context = CONTEXT_ATTRIBUTEVALUE; 1455 value = readLiteral(LIT_CHAR_REF); 1456 context = CONTEXT_DTD; 1457 } 1458 1459 setAttribute(elementName, name, type, enumeration, value, valueType); 1460 } 1461 1462 /** 1463 * Parse a conditional section. 1464 * [63] conditionalSect ::= includeSect || ignoreSect 1465 * [64] includeSect ::= '<![' %'INCLUDE' '[' (%markupdecl*)* ']]>' 1466 * [65] ignoreSect ::= '<![' %'IGNORE' '[' ignoreSectContents* ']]>' 1467 * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]>')) 1468 * | ('<![' ignoreSectContents* ']]>') 1469 * | (Char - (']' | [<'"])) 1470 * | ('<!' (Char - ('-' | '['))) 1471 * *NOTE: the '<![' has already been read. 1472 * *TODO: verify that I am handling ignoreSectContents right. 
1473 */ 1474 void parseConditionalSect() throws java.lang.Exception { 1475 skipWhitespace(); 1476 1477 if (tryRead("INCLUDE")) { 1478 skipWhitespace(); 1479 require('['); 1480 skipWhitespace(); 1481 1482 while (!tryRead("]]>")) { 1483 parseMarkupdecl(); 1484 skipWhitespace(); 1485 } 1486 } else if (tryRead("IGNORE")) { 1487 skipWhitespace(); 1488 require('['); 1489 1490 char c; 1491 1492 for (int nest = 1; nest > 0;) { 1493 c = readCh(); 1494 1495 switch (c) { 1496 case '<': 1497 1498 if (tryRead("![")) { 1499 nest++; 1500 } 1501 1502 break; 1503 1504 case ']': 1505 1506 if (tryRead("]>")) { 1507 nest--; 1508 } 1509 1510 break; 1511 } 1512 } 1513 } else { 1514 error("conditional section must begin with INCLUDE or IGNORE", null, 1515 null); 1516 } 1517 } 1518 1519 /** 1520 * Read a character reference. 1521 * [67] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' 1522 * *NOTE: the '&#' has already been read. 1523 */ 1524 void parseCharRef() throws java.lang.Exception { 1525 int value = 0; 1526 char c; 1527 1528 if (tryRead('x')) { 1529 loop1: while (true) { 1530 c = readCh(); 1531 1532 switch (c) { 1533 case '0': 1534 case '1': 1535 case '2': 1536 case '3': 1537 case '4': 1538 case '5': 1539 case '6': 1540 case '7': 1541 case '8': 1542 case '9': 1543 case 'a': 1544 case 'A': 1545 case 'b': 1546 case 'B': 1547 case 'c': 1548 case 'C': 1549 case 'd': 1550 case 'D': 1551 case 'e': 1552 case 'E': 1553 case 'f': 1554 case 'F': 1555 value *= 16; 1556 value += Integer.parseInt(Character.toString(c), 16); 1557 break; 1558 1559 case ';': 1560 break loop1; 1561 1562 default: 1563 error("illegal character in character reference", c, null); 1564 break loop1; 1565 } 1566 } 1567 } else { 1568 loop2: while (true) { 1569 c = readCh(); 1570 1571 switch (c) { 1572 case '0': 1573 case '1': 1574 case '2': 1575 case '3': 1576 case '4': 1577 case '5': 1578 case '6': 1579 case '7': 1580 case '8': 1581 case '9': 1582 value *= 10; 1583 value += Integer.parseInt(Character.toString(c), 
10); 1584 break; 1585 1586 case ';': 1587 break loop2; 1588 1589 default: 1590 error("illegal character in character reference", c, null); 1591 break loop2; 1592 } 1593 } 1594 } 1595 1596 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz 1597 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: 1598 if (value <= 0x0000ffff) { 1599 // no surrogates needed 1600 dataBufferAppend((char) value); 1601 } else if (value <= 0x000fffff) { 1602 // > 16 bits, surrogate needed 1603 dataBufferAppend((char) (0xd8 | (value & 0x000ffc00) >> 10)); 1604 dataBufferAppend((char) (0xdc | value & 0x0003ff)); 1605 } else { 1606 // too big for surrogate 1607 error("character reference " + value + " is too large for UTF-16", 1608 Integer.valueOf(value).toString(), null); 1609 } 1610 } 1611 1612 /** 1613 * Parse a reference. 1614 * [69] EntityRef ::= '&' Name ';' 1615 * *NOTE: the '&' has already been read. 1616 * @param externalAllowed External entities are allowed here. 1617 */ 1618 void parseEntityRef(boolean externalAllowed) throws java.lang.Exception { 1619 String name; 1620 1621 name = readNmtoken(true); 1622 require(';'); 1623 1624 switch (getEntityType(name)) { 1625 case ENTITY_UNDECLARED: 1626 error("reference to undeclared entity", name, null); 1627 break; 1628 1629 case ENTITY_INTERNAL: 1630 pushString(name, getEntityValue(name)); 1631 break; 1632 1633 case ENTITY_TEXT: 1634 1635 if (externalAllowed) { 1636 pushURL(name, getEntityPublicId(name), getEntitySystemId(name), 1637 null, null, null); 1638 } else { 1639 error("reference to external entity in attribute value.", name, 1640 null); 1641 } 1642 1643 break; 1644 1645 case ENTITY_NDATA: 1646 1647 if (externalAllowed) { 1648 error("data entity reference in content", name, null); 1649 } else { 1650 error("reference to external entity in attribute value.", name, 1651 null); 1652 } 1653 1654 break; 1655 } 1656 } 1657 1658 /** 1659 * Parse a parameter entity reference. 
1660 * [70] PEReference ::= '%' Name ';' 1661 * *NOTE: the '%' has already been read. 1662 */ 1663 void parsePEReference(boolean isEntityValue) throws java.lang.Exception { 1664 String name; 1665 1666 name = "%" + readNmtoken(true); 1667 require(';'); 1668 1669 switch (getEntityType(name)) { 1670 case ENTITY_UNDECLARED: 1671 error("reference to undeclared parameter entity", name, null); 1672 break; 1673 1674 case ENTITY_INTERNAL: 1675 1676 if (isEntityValue) { 1677 pushString(name, getEntityValue(name)); 1678 } else { 1679 pushString(name, " " + getEntityValue(name) + ' '); 1680 } 1681 1682 break; 1683 1684 case ENTITY_TEXT: 1685 1686 if (isEntityValue) { 1687 pushString(null, " "); 1688 } 1689 1690 pushURL(name, getEntityPublicId(name), getEntitySystemId(name), 1691 null, null, null); 1692 1693 if (isEntityValue) { 1694 pushString(null, " "); 1695 } 1696 1697 break; 1698 } 1699 } 1700 1701 /** 1702 * Parse an entity declaration. 1703 * [71] EntityDecl ::= '<!ENTITY' S %Name S %EntityDef S? '>' 1704 * | '<!ENTITY' S '%' S %Name S %EntityDef S? '>' 1705 * [72] EntityDef ::= EntityValue | ExternalDef 1706 * [73] ExternalDef ::= ExternalID %NDataDecl? 1707 * [74] ExternalID ::= 'SYSTEM' S SystemLiteral 1708 * | 'PUBLIC' S PubidLiteral S SystemLiteral 1709 * [75] NDataDecl ::= S %'NDATA' S %Name 1710 * *NOTE: the '<!ENTITY' has already been read. 1711 */ 1712 void parseEntityDecl() throws java.lang.Exception { 1713 char c; 1714 boolean peFlag = false; 1715 String name; 1716 String value; 1717 String notationName; 1718 String[] ids; 1719 1720 // Check for a parameter entity. 1721 requireWhitespace(); 1722 1723 if (tryRead('%')) { 1724 peFlag = true; 1725 requireWhitespace(); 1726 } 1727 1728 // Read the entity name, and prepend 1729 // '%' if necessary. 1730 name = readNmtoken(true); 1731 1732 if (peFlag) { 1733 name = "%" + name; 1734 } 1735 1736 // Read the entity value. 
1737 requireWhitespace(); 1738 c = readCh(); 1739 unread(c); 1740 1741 if (c == '"' || c == '\'') { 1742 // Internal entity. 1743 context = CONTEXT_ENTITYVALUE; 1744 value = readLiteral(LIT_CHAR_REF | LIT_PE_REF); 1745 context = CONTEXT_DTD; 1746 setInternalEntity(name, value); 1747 } else { 1748 // Read the external IDs 1749 ids = readExternalIds(false); 1750 1751 if (ids[1] == null) { 1752 error("system identifier missing", name, null); 1753 } 1754 1755 // Check for NDATA declaration. 1756 skipWhitespace(); 1757 1758 if (tryRead("NDATA")) { 1759 requireWhitespace(); 1760 notationName = readNmtoken(true); 1761 setExternalDataEntity(name, ids[0], ids[1], notationName); 1762 } else { 1763 setExternalTextEntity(name, ids[0], ids[1]); 1764 } 1765 } 1766 1767 // Finish the declaration. 1768 skipWhitespace(); 1769 require('>'); 1770 } 1771 1772 /** 1773 * Parse a notation declaration. 1774 * [81] NotationDecl ::= '<!NOTATION' S %Name S %ExternalID S? '>' 1775 * *NOTE: the '<!NOTATION' has already been read. 1776 */ 1777 void parseNotationDecl() throws java.lang.Exception { 1778 String nname; 1779 String[] ids; 1780 1781 requireWhitespace(); 1782 nname = readNmtoken(true); 1783 1784 requireWhitespace(); 1785 1786 // Read the external identifiers. 1787 ids = readExternalIds(true); 1788 1789 if (ids[0] == null && ids[1] == null) { 1790 error("external identifier missing", nname, null); 1791 } 1792 1793 // Register the notation. 1794 setNotation(nname, ids[0], ids[1]); 1795 1796 skipWhitespace(); 1797 require('>'); 1798 } 1799 1800 /** 1801 * Parse PCDATA. 1802 * <pre> 1803 * [16] PCData ::= [^<&]* 1804 * </pre> 1805 * <p>The trick here is that the data stays in the dataBuffer without 1806 * necessarily being converted to a string right away. 
1807 */ 1808 void parsePCData() throws java.lang.Exception { 1809 char c; 1810 1811 // Start with a little cheat -- in most 1812 // cases, the entire sequence of 1813 // character data will already be in 1814 // the readBuffer; if not, fall through to 1815 // the normal approach. 1816 if (USE_CHEATS) { 1817 int lineAugment = 0; 1818 int columnAugment = 0; 1819 1820 /*loop:*/for (int i = readBufferPos; i < readBufferLength; i++) { 1821 switch (readBuffer[i]) { 1822 case '\n': 1823 lineAugment++; 1824 columnAugment = 0; 1825 break; 1826 1827 case '&': 1828 case '<': 1829 1830 int start = readBufferPos; 1831 columnAugment++; 1832 readBufferPos = i; 1833 1834 if (lineAugment > 0) { 1835 line += lineAugment; 1836 column = columnAugment; 1837 } else { 1838 column += columnAugment; 1839 } 1840 1841 dataBufferAppend(readBuffer, start, i - start); 1842 return; 1843 1844 default: 1845 columnAugment++; 1846 } 1847 } 1848 } 1849 1850 // OK, the cheat didn't work; start over 1851 // and do it by the book. 1852 while (true) { 1853 c = readCh(); 1854 1855 switch (c) { 1856 case '<': 1857 case '&': 1858 unread(c); 1859 return; 1860 1861 default: 1862 dataBufferAppend(c); 1863 break; 1864 } 1865 } 1866 } 1867 1868 ////////////////////////////////////////////////////////////////////// 1869 // High-level reading and scanning methods. 1870 ////////////////////////////////////////////////////////////////////// 1871 1872 /** 1873 * Require whitespace characters. 1874 * [1] S ::= (#x20 | #x9 | #xd | #xa)+ 1875 */ 1876 void requireWhitespace() throws java.lang.Exception { 1877 char c = readCh(); 1878 1879 if (isWhitespace(c)) { 1880 skipWhitespace(); 1881 } else { 1882 error("whitespace expected", c, null); 1883 } 1884 } 1885 1886 /** 1887 * Parse whitespace characters, and leave them in the data buffer. 
1888 */ 1889 void parseWhitespace() throws java.lang.Exception { 1890 char c = readCh(); 1891 1892 while (isWhitespace(c)) { 1893 dataBufferAppend(c); 1894 c = readCh(); 1895 } 1896 1897 unread(c); 1898 } 1899 1900 /** 1901 * Skip whitespace characters. 1902 * [1] S ::= (#x20 | #x9 | #xd | #xa)+ 1903 */ 1904 void skipWhitespace() throws java.lang.Exception { 1905 // Start with a little cheat. Most of 1906 // the time, the white space will fall 1907 // within the current read buffer; if 1908 // not, then fall through. 1909 if (USE_CHEATS) { 1910 int lineAugment = 0; 1911 int columnAugment = 0; 1912 1913 loop: for (int i = readBufferPos; i < readBufferLength; i++) { 1914 switch (readBuffer[i]) { 1915 case ' ': 1916 case '\t': 1917 case '\r': 1918 columnAugment++; 1919 break; 1920 1921 case '\n': 1922 lineAugment++; 1923 columnAugment = 0; 1924 break; 1925 1926 case '%': 1927 1928 if (context == CONTEXT_DTD 1929 || context == CONTEXT_ENTITYVALUE) { 1930 break loop; 1931 } // else fall through... 1932 1933 default: 1934 readBufferPos = i; 1935 1936 if (lineAugment > 0) { 1937 line += lineAugment; 1938 column = columnAugment; 1939 } else { 1940 column += columnAugment; 1941 } 1942 1943 return; 1944 } 1945 } 1946 } 1947 1948 // OK, do it by the book. 1949 char c = readCh(); 1950 1951 while (isWhitespace(c)) { 1952 c = readCh(); 1953 } 1954 1955 unread(c); 1956 } 1957 1958 /** 1959 * Read a name or name token. 1960 * [5] Name ::= (Letter | '_' | ':') (NameChar)* 1961 * [7] Nmtoken ::= (NameChar)+ 1962 * *NOTE: [6] is implemented implicitly where required. 1963 */ 1964 String readNmtoken(boolean isName) throws java.lang.Exception { 1965 char c; 1966 1967 if (USE_CHEATS) { 1968 loop: for (int i = readBufferPos; i < readBufferLength; i++) { 1969 switch (readBuffer[i]) { 1970 case '%': 1971 1972 if (context == CONTEXT_DTD 1973 || context == CONTEXT_ENTITYVALUE) { 1974 break loop; 1975 } // else fall through... 
1976 1977 case '<': 1978 case '>': 1979 case '&': 1980 case ',': 1981 case '|': 1982 case '*': 1983 case '+': 1984 case '?': 1985 case ')': 1986 case '=': 1987 case '\'': 1988 case '"': 1989 case '[': 1990 case ' ': 1991 case '\t': 1992 case '\r': 1993 case '\n': 1994 case ';': 1995 case '/': 1996 case '#': 1997 1998 int start = readBufferPos; 1999 2000 if (i == start) { 2001 error("name expected", readBuffer[i], null); 2002 } 2003 2004 readBufferPos = i; 2005 return intern(readBuffer, start, i - start); 2006 } 2007 } 2008 } 2009 2010 nameBufferPos = 0; 2011 2012 // Read the first character. 2013 /*loop: */while (true) { 2014 c = readCh(); 2015 2016 switch (c) { 2017 case '%': 2018 case '<': 2019 case '>': 2020 case '&': 2021 case ',': 2022 case '|': 2023 case '*': 2024 case '+': 2025 case '?': 2026 case ')': 2027 case '=': 2028 case '\'': 2029 case '"': 2030 case '[': 2031 case ' ': 2032 case '\t': 2033 case '\n': 2034 case '\r': 2035 case ';': 2036 case '/': 2037 unread(c); 2038 2039 if (nameBufferPos == 0) { 2040 error("name expected", null, null); 2041 } 2042 2043 String s = intern(nameBuffer, 0, nameBufferPos); 2044 nameBufferPos = 0; 2045 return s; 2046 2047 default: 2048 nameBuffer = (char[]) extendArray(nameBuffer, nameBuffer.length, 2049 nameBufferPos); 2050 nameBuffer[nameBufferPos++] = c; 2051 } 2052 } 2053 } 2054 2055 /** 2056 * Read a literal. 2057 * [10] AttValue ::= '"' ([^<&"] | Reference)* '"' 2058 * | "'" ([^<&'] | Reference)* "'" 2059 * [11] SystemLiteral ::= '"' URLchar* '"' | "'" (URLchar - "'")* "'" 2060 * [13] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 2061 * [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' 2062 * | "'" ([^%&'] | PEReference | Reference)* "'" 2063 */ 2064 String readLiteral(int flags) throws java.lang.Exception { 2065 char delim; 2066 char c; 2067 int startLine = line; 2068 2069 // Find the delimiter. 
2070 delim = readCh(); 2071 2072 if (delim != '"' && delim != '\'' && delim != (char) 0) { 2073 error("expected '\"' or \"'\"", delim, null); 2074 return null; 2075 } 2076 2077 // Read the literal. 2078 try { 2079 c = readCh(); 2080 2081 loop: while (c != delim) { 2082 switch (c) { 2083 // Literals never have line ends 2084 case '\n': 2085 case '\r': 2086 c = ' '; 2087 break; 2088 2089 // References may be allowed 2090 case '&': 2091 2092 if ((flags & LIT_CHAR_REF) > 0) { 2093 c = readCh(); 2094 2095 if (c == '#') { 2096 parseCharRef(); 2097 c = readCh(); 2098 continue loop; // check the next character 2099 } else if ((flags & LIT_ENTITY_REF) > 0) { 2100 unread(c); 2101 parseEntityRef(false); 2102 c = readCh(); 2103 continue loop; 2104 } else { 2105 dataBufferAppend('&'); 2106 } 2107 } 2108 2109 break; 2110 2111 default: 2112 break; 2113 } 2114 2115 dataBufferAppend(c); 2116 c = readCh(); 2117 } 2118 } catch (EOFException e) { 2119 error("end of input while looking for delimiter (started on line " 2120 + startLine + ')', null, Character.toString(delim)); 2121 } 2122 2123 // Normalise whitespace if necessary. 2124 if ((flags & LIT_NORMALIZE) > 0) { 2125 dataBufferNormalize(); 2126 } 2127 2128 // Return the value. 2129 return dataBufferToString(); 2130 } 2131 2132 /** 2133 * Try reading external identifiers. 2134 * <p>The system identifier is not required for notations. 2135 * @param inNotation Are we in a notation? 2136 * @return A two-member String array containing the identifiers. 
2137 */ 2138 String[] readExternalIds(boolean inNotation) throws java.lang.Exception { 2139 String[] ids = new String[2]; 2140 2141 if (tryRead("PUBLIC")) { 2142 requireWhitespace(); 2143 ids[0] = readLiteral(LIT_NORMALIZE); // public id 2144 2145 if (inNotation) { 2146 skipWhitespace(); 2147 2148 if (tryRead('"') || tryRead('\'')) { 2149 ids[1] = readLiteral(0); 2150 } 2151 } else { 2152 requireWhitespace(); 2153 ids[1] = readLiteral(0); // system id 2154 } 2155 } else if (tryRead("SYSTEM")) { 2156 requireWhitespace(); 2157 ids[1] = readLiteral(0); // system id 2158 } 2159 2160 return ids; 2161 } 2162 2163 /** 2164 * Test if a character is whitespace. 2165 * <pre> 2166 * [1] S ::= (#x20 | #x9 | #xd | #xa)+ 2167 * </pre> 2168 * @param c The character to test. 2169 * @return true if the character is whitespace. 2170 */ 2171 final boolean isWhitespace(char c) { 2172 switch (c) { 2173 case 0x20: 2174 case 0x09: 2175 case 0x0d: 2176 case 0x0a: 2177 return true; 2178 2179 default: 2180 return false; 2181 } 2182 } 2183 2184 ////////////////////////////////////////////////////////////////////// 2185 // Utility routines. 2186 ////////////////////////////////////////////////////////////////////// 2187 2188 /** 2189 * Add a character to the data buffer. 2190 */ 2191 void dataBufferAppend(char c) { 2192 // Expand buffer if necessary. 2193 if (dataBufferPos >= dataBuffer.length) { 2194 2195 // dataBufferAppend() gets called alot, so instead of 2196 // calling extendArray() here, we optimize the heck out of this 2197 // code. 
2198 //dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length, 2199 // dataBufferPos); 2200 2201 final int currentSize = dataBuffer.length; 2202 int newSize = currentSize * 2; 2203 2204 if (newSize <= dataBufferPos) { 2205 newSize = dataBufferPos + 1; 2206 } 2207 2208 // Dwight Richards pointed out that newSize was ignored (11/03) 2209 char[] newArray = new char[newSize]; 2210 2211 System.arraycopy(dataBuffer, 0, newArray, 0, currentSize); 2212 dataBuffer = newArray; 2213 } 2214 dataBuffer[dataBufferPos++] = c; 2215 } 2216 2217 /** 2218 * Add a string to the data buffer. 2219 */ 2220 void dataBufferAppend(String s) { 2221 dataBufferAppend(s.toCharArray(), 0, s.length()); 2222 } 2223 2224 /** 2225 * Append (part of) a character array to the data buffer. 2226 */ 2227 void dataBufferAppend(char[] ch, int start, int length) { 2228 dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length, 2229 dataBufferPos + length); 2230 System.arraycopy(ch, start, dataBuffer, dataBufferPos, length); 2231 dataBufferPos += length; 2232 } 2233 2234 /** 2235 * Normalise whitespace in the data buffer. 2236 */ 2237 void dataBufferNormalize() { 2238 int i = 0; 2239 int j = 0; 2240 int end = dataBufferPos; 2241 2242 // Skip whitespace at the start. 2243 while (j < end && isWhitespace(dataBuffer[j])) { 2244 j++; 2245 } 2246 2247 // Skip whitespace at the end. 2248 while (end > j && isWhitespace(dataBuffer[end - 1])) { 2249 end--; 2250 } 2251 2252 // Start copying to the left. 2253 while (j < end) { 2254 char c = dataBuffer[j++]; 2255 2256 // Normalise all other whitespace to 2257 // a single space. 2258 if (isWhitespace(c)) { 2259 while (j < end && isWhitespace(dataBuffer[j++])) { 2260 } 2261 2262 dataBuffer[i++] = ' '; 2263 dataBuffer[i++] = dataBuffer[j - 1]; 2264 } else { 2265 dataBuffer[i++] = c; 2266 } 2267 } 2268 2269 // The new length is <= the old one. 2270 dataBufferPos = i; 2271 } 2272 2273 /** 2274 * Convert the data buffer to a string. 
2275 * @see #intern(char[],int,int) 2276 */ 2277 String dataBufferToString() { 2278 String s = new String(dataBuffer, 0, dataBufferPos); 2279 dataBufferPos = 0; 2280 return s; 2281 } 2282 2283 /** 2284 * Flush the contents of the data buffer to the handler, if 2285 * appropriate, and reset the buffer for new input. 2286 */ 2287 void dataBufferFlush() throws java.lang.Exception { 2288 if (dataBufferPos > 0) { 2289 switch (currentElementContent) { 2290 case CONTENT_UNDECLARED: 2291 case CONTENT_EMPTY: 2292 2293 // do nothing 2294 break; 2295 2296 case CONTENT_MIXED: 2297 case CONTENT_ANY: 2298 2299 if (handler != null) { 2300 handler.charData(dataBuffer, 0, dataBufferPos); 2301 } 2302 2303 break; 2304 2305 case CONTENT_ELEMENTS: 2306 2307 if (handler != null) { 2308 handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos); 2309 } 2310 2311 break; 2312 } 2313 2314 dataBufferPos = 0; 2315 } 2316 } 2317 2318 /** 2319 * Require a string to appear, or throw an exception. 2320 */ 2321 void require(String delim) throws java.lang.Exception { 2322 char[] ch = delim.toCharArray(); 2323 2324 for (char element : ch) { 2325 require(element); 2326 } 2327 } 2328 2329 /** 2330 * Require a character to appear, or throw an exception. 2331 */ 2332 void require(char delim) throws java.lang.Exception { 2333 char c = readCh(); 2334 2335 if (c != delim) { 2336 error("expected character", c, Character.toString(delim)); 2337 } 2338 } 2339 2340 /** 2341 * Return an internalised version of a string. 2342 * <p>Ælfred uses this method to create an internalised version 2343 * of all names and attribute values, so that it can test equality 2344 * with <code>==</code> instead of <code>String.equals()</code>. 2345 * <p>If you want to be able to test for equality in the same way, 2346 * you can use this method to internalise your own strings first: 2347 * <pre> 2348 * String PARA = handler.intern("PARA"); 2349 * </pre> 2350 * <p>Note that this will not return the same results as String.intern(). 
2351 * @param s The string to internalise. 2352 * @return An internalised version of the string. 2353 * @see #intern(char[],int,int) 2354 * @see java.lang.String#intern 2355 */ 2356 public String intern(String s) { 2357 char[] ch = s.toCharArray(); 2358 return intern(ch, 0, ch.length); 2359 } 2360 2361 /** 2362 * Create an internalised string from a character array. 2363 * <p>This is much more efficient than constructing a non-internalised 2364 * string first, and then internalising it. 2365 * <p>Note that this will not return the same results as String.intern(). 2366 * @param ch an array of characters for building the string. 2367 * @param start the starting position in the array. 2368 * @param length the number of characters to place in the string. 2369 * @return an internalised string. 2370 * @see #intern(String) 2371 * @see java.lang.String#intern 2372 */ 2373 public String intern(char[] ch, int start, int length) { 2374 int index; 2375 int hash = 0; 2376 2377 // Generate a hash code. 2378 for (int i = start; i < start + length; i++) { 2379 hash = (hash << 1 & 0xffffff) + ch[i]; 2380 } 2381 2382 hash = hash % SYMBOL_TABLE_LENGTH; 2383 2384 // Get the bucket. 2385 Object[] bucket = (Object[]) symbolTable[hash]; 2386 2387 if (bucket == null) { 2388 symbolTable[hash] = bucket = new Object[8]; 2389 } 2390 2391 // Search for a matching tuple, and 2392 // return the string if we find one. 2393 for (index = 0; index < bucket.length; index += 2) { 2394 char[] chFound = (char[]) bucket[index]; 2395 2396 // Stop when we hit a null index. 2397 if (chFound == null) { 2398 break; 2399 } 2400 2401 // If they're the same length, 2402 // check for a match. 2403 // If the loop finishes, 'index' will 2404 // contain the current bucket 2405 // position. 2406 if (chFound.length == length) { 2407 for (int i = 0; i < chFound.length; i++) { 2408 // Stop if there are no more tuples. 
2409 if (ch[start + i] != chFound[i]) { 2410 break; 2411 } else if (i == length - 1) { 2412 // That's it, we have a match! 2413 return (String) bucket[index + 1]; 2414 } 2415 } 2416 } 2417 } 2418 2419 // Not found -- we'll have to add it. 2420 // Do we have to grow the bucket? 2421 bucket = (Object[]) extendArray(bucket, bucket.length, index); 2422 2423 // OK, add it to the end of the 2424 // bucket. 2425 String s = new String(ch, start, length); 2426 bucket[index] = s.toCharArray(); 2427 bucket[index + 1] = s; 2428 symbolTable[hash] = bucket; 2429 return s; 2430 } 2431 2432 /** 2433 * Ensure the capacity of an array, allocating a new one if 2434 * necessary. 2435 */ 2436 Object extendArray(Object array, int currentSize, int requiredSize) { 2437 if (requiredSize < currentSize) { 2438 return array; 2439 } else { 2440 Object newArray = null; 2441 int newSize = currentSize * 2; 2442 2443 if (newSize <= requiredSize) { 2444 newSize = requiredSize + 1; 2445 } 2446 2447 // Dwight Richards pointed out that newSize was ignored (11/03) 2448 if (array instanceof char[]) { 2449 newArray = new char[newSize]; 2450 } else if (array instanceof Object[]) { 2451 newArray = new Object[newSize]; 2452 } else { 2453 throw new RuntimeException("Array must be char[] or Object[]"); 2454 } 2455 2456 System.arraycopy(array, 0, newArray, 0, currentSize); 2457 return newArray; 2458 } 2459 } 2460 2461 ////////////////////////////////////////////////////////////////////// 2462 // XML query routines. 2463 ////////////////////////////////////////////////////////////////////// 2464 // 2465 // Elements 2466 // 2467 2468 /** 2469 * Get the declared elements for an XML document. 2470 * <p>The results will be valid only after the DTD (if any) has been 2471 * parsed. 2472 * @return An enumeration of all element types declared for this 2473 * document (as Strings). 
2474 * @see #getElementContentType 2475 * @see #getElementContentModel 2476 */ 2477 public Enumeration declaredElements() { 2478 return elementInfo.keys(); 2479 } 2480 2481 /** 2482 * Look up the content type of an element. 2483 * @param name The element type name. 2484 * @return An integer constant representing the content type. 2485 * @see #getElementContentModel 2486 * @see #CONTENT_UNDECLARED 2487 * @see #CONTENT_ANY 2488 * @see #CONTENT_EMPTY 2489 * @see #CONTENT_MIXED 2490 * @see #CONTENT_ELEMENTS 2491 */ 2492 public int getElementContentType(String name) { 2493 Object[] element = (Object[]) elementInfo.get(name); 2494 2495 if (element == null) { 2496 return CONTENT_UNDECLARED; 2497 } else { 2498 return ((Integer) element[0]).intValue(); 2499 } 2500 } 2501 2502 /** 2503 * Look up the content model of an element. 2504 * <p>The result will always be null unless the content type is 2505 * CONTENT_ELEMENTS or CONTENT_MIXED. 2506 * @param name The element type name. 2507 * @return The normalised content model, as a string. 2508 * @see #getElementContentType 2509 */ 2510 public String getElementContentModel(String name) { 2511 Object[] element = (Object[]) elementInfo.get(name); 2512 2513 if (element == null) { 2514 return null; 2515 } else { 2516 return (String) element[1]; 2517 } 2518 } 2519 2520 /** 2521 * Register an element. 2522 * Array format: 2523 * element type 2524 * attribute hash table 2525 */ 2526 void setElement(String name, int contentType, String contentModel, 2527 Hashtable attributes) throws java.lang.Exception { 2528 Object[] element; 2529 2530 // Try looking up the element 2531 element = (Object[]) elementInfo.get(name); 2532 2533 // Make a new one if necessary. 
2534 if (element == null) { 2535 element = new Object[3]; 2536 element[0] = Integer.valueOf(CONTENT_UNDECLARED); 2537 element[1] = null; 2538 element[2] = null; 2539 } else if (contentType != CONTENT_UNDECLARED 2540 && ((Integer) element[0]).intValue() != CONTENT_UNDECLARED) { 2541 error("multiple declarations for element type", name, null); 2542 return; 2543 } 2544 2545 // Insert the content type, if any. 2546 if (contentType != CONTENT_UNDECLARED) { 2547 element[0] = Integer.valueOf(contentType); 2548 } 2549 2550 // Insert the content model, if any. 2551 if (contentModel != null) { 2552 element[1] = contentModel; 2553 } 2554 2555 // Insert the attributes, if any. 2556 if (attributes != null) { 2557 element[2] = attributes; 2558 } 2559 2560 // Save the element info. 2561 elementInfo.put(name, element); 2562 } 2563 2564 /** 2565 * Look up the attribute hash table for an element. 2566 * The hash table is the second item in the element array. 2567 */ 2568 Hashtable getElementAttributes(String name) { 2569 Object[] element = (Object[]) elementInfo.get(name); 2570 2571 if (element == null) { 2572 return null; 2573 } else { 2574 return (Hashtable) element[2]; 2575 } 2576 } 2577 2578 // 2579 // Attributes 2580 // 2581 2582 /** 2583 * Get the declared attributes for an element type. 2584 * @param elname The name of the element type. 2585 * @return An Enumeration of all the attributes declared for 2586 * a specific element type. The results will be valid only 2587 * after the DTD (if any) has been parsed. 
2588 * @see #getAttributeType 2589 * @see #getAttributeEnumeration 2590 * @see #getAttributeDefaultValueType 2591 * @see #getAttributeDefaultValue 2592 * @see #getAttributeExpandedValue 2593 */ 2594 public Enumeration declaredAttributes(String elname) { 2595 Hashtable attlist = getElementAttributes(elname); 2596 2597 if (attlist == null) { 2598 return null; 2599 } else { 2600 return attlist.keys(); 2601 } 2602 } 2603 2604 /** 2605 * Retrieve the declared type of an attribute. 2606 * @param name The name of the associated element. 2607 * @param aname The name of the attribute. 2608 * @return An integer constant representing the attribute type. 2609 * @see #ATTRIBUTE_UNDECLARED 2610 * @see #ATTRIBUTE_CDATA 2611 * @see #ATTRIBUTE_ID 2612 * @see #ATTRIBUTE_IDREF 2613 * @see #ATTRIBUTE_IDREFS 2614 * @see #ATTRIBUTE_ENTITY 2615 * @see #ATTRIBUTE_ENTITIES 2616 * @see #ATTRIBUTE_NMTOKEN 2617 * @see #ATTRIBUTE_NMTOKENS 2618 * @see #ATTRIBUTE_ENUMERATED 2619 * @see #ATTRIBUTE_NOTATION 2620 */ 2621 public int getAttributeType(String name, String aname) { 2622 Object[] attribute = getAttribute(name, aname); 2623 2624 if (attribute == null) { 2625 return ATTRIBUTE_UNDECLARED; 2626 } else { 2627 return ((Integer) attribute[0]).intValue(); 2628 } 2629 } 2630 2631 /** 2632 * Retrieve the allowed values for an enumerated attribute type. 2633 * @param name The name of the associated element. 2634 * @param aname The name of the attribute. 2635 * @return A string containing the token list. 2636 * @see #ATTRIBUTE_ENUMERATED 2637 * @see #ATTRIBUTE_NOTATION 2638 */ 2639 public String getAttributeEnumeration(String name, String aname) { 2640 Object[] attribute = getAttribute(name, aname); 2641 2642 if (attribute == null) { 2643 return null; 2644 } else { 2645 return (String) attribute[3]; 2646 } 2647 } 2648 2649 /** 2650 * Retrieve the default value of a declared attribute. 2651 * @param name The name of the associated element. 2652 * @param aname The name of the attribute. 
2653 * @return The default value, or null if the attribute was 2654 * #IMPLIED or simply undeclared and unspecified. 2655 * @see #getAttributeExpandedValue 2656 */ 2657 public String getAttributeDefaultValue(String name, String aname) { 2658 Object[] attribute = getAttribute(name, aname); 2659 2660 if (attribute == null) { 2661 return null; 2662 } else { 2663 return (String) attribute[1]; 2664 } 2665 } 2666 2667 /** 2668 * Retrieve the expanded value of a declared attribute. 2669 * <p>All general entities will be expanded. 2670 * @param name The name of the associated element. 2671 * @param aname The name of the attribute. 2672 * @return The expanded default value, or null if the attribute was 2673 * #IMPLIED or simply undeclared 2674 * @see #getAttributeDefaultValue 2675 */ 2676 public String getAttributeExpandedValue(String name, String aname) { 2677 Object[] attribute = getAttribute(name, aname); 2678 2679 if (attribute == null) { 2680 return null; 2681 } else if (attribute[4] == null && attribute[1] != null) { 2682 try { 2683 pushString(null, (char) 0 + (String) attribute[1] + (char) 0); 2684 attribute[4] = readLiteral( 2685 LIT_NORMALIZE | LIT_CHAR_REF | LIT_ENTITY_REF); 2686 } catch (Exception ex) { 2687 // We could ignore this and return but instead return here. 2688 return (String) attribute[4]; 2689 } 2690 } 2691 2692 return (String) attribute[4]; 2693 } 2694 2695 /** 2696 * Retrieve the default value type of a declared attribute. 2697 * @param name The name of the element. 2698 * @param aname The name of the attribute. 2699 * @return ATTRIBUTE_DEFAULT_UNDECLARED if the attribute 2700 * cannot be found, otherwise return an integer. 
2701 * @see #ATTRIBUTE_DEFAULT_SPECIFIED 2702 * @see #ATTRIBUTE_DEFAULT_IMPLIED 2703 * @see #ATTRIBUTE_DEFAULT_REQUIRED 2704 * @see #ATTRIBUTE_DEFAULT_FIXED 2705 */ 2706 public int getAttributeDefaultValueType(String name, String aname) { 2707 Object[] attribute = getAttribute(name, aname); 2708 2709 if (attribute == null) { 2710 return ATTRIBUTE_DEFAULT_UNDECLARED; 2711 } else { 2712 return ((Integer) attribute[2]).intValue(); 2713 } 2714 } 2715 2716 /** 2717 * Register an attribute declaration for later retrieval. 2718 * Format: 2719 * - String type 2720 * - String default value 2721 * - int value type 2722 * *TODO: do something with attribute types. 2723 */ 2724 void setAttribute(String elName, String name, int type, String enumeration, 2725 String value, int valueType) throws java.lang.Exception { 2726 Hashtable attlist; 2727 Object[] attribute; 2728 2729 // Create a new hashtable if necessary. 2730 attlist = getElementAttributes(elName); 2731 2732 if (attlist == null) { 2733 attlist = new Hashtable(); 2734 } 2735 2736 // Check that the attribute doesn't 2737 // already exist! 2738 if (attlist.get(name) != null) { 2739 return; 2740 } else { 2741 attribute = new Object[5]; 2742 attribute[0] = Integer.valueOf(type); 2743 attribute[1] = value; 2744 attribute[2] = Integer.valueOf(valueType); 2745 attribute[3] = enumeration; 2746 attribute[4] = null; 2747 attlist.put(name.intern(), attribute); 2748 2749 // Use CONTENT_UNDECLARED to avoid overwriting 2750 // existing element declaration. 2751 setElement(elName, CONTENT_UNDECLARED, null, attlist); 2752 } 2753 } 2754 2755 /** 2756 * Retrieve the three-member array representing an 2757 * attribute declaration. 2758 * @param elName The name of the element. 2759 * @param name The name of the attribute. 
2760 */ 2761 Object[] getAttribute(String elName, String name) { 2762 Hashtable attlist; 2763 Object[] attribute; 2764 2765 attlist = getElementAttributes(elName); 2766 2767 if (attlist == null) { 2768 return null; 2769 } 2770 2771 attribute = (Object[]) attlist.get(name); 2772 return attribute; 2773 } 2774 2775 // 2776 // Entities 2777 // 2778 2779 /** 2780 * Get declared entities. 2781 * @return An Enumeration of all the entities declared for 2782 * this XML document. The results will be valid only 2783 * after the DTD (if any) has been parsed. 2784 * @see #getEntityType 2785 * @see #getEntityPublicId 2786 * @see #getEntitySystemId 2787 * @see #getEntityValue 2788 * @see #getEntityNotationName 2789 */ 2790 public Enumeration declaredEntities() { 2791 return entityInfo.keys(); 2792 } 2793 2794 /** Return the current element. 2795 * @return The current Element. 2796 */ 2797 public String getCurrentElement() { 2798 // Ptolemy localization for MoMLParser so that we 2799 // can get the currentElement from within MoMLParser.attribute() 2800 return currentElement; 2801 } 2802 2803 /** 2804 * Find the type of an entity. 2805 * @param ename The name of the entity. 2806 * @return An integer constant representing the entity type. 2807 * @see #ENTITY_UNDECLARED 2808 * @see #ENTITY_INTERNAL 2809 * @see #ENTITY_NDATA 2810 * @see #ENTITY_TEXT 2811 */ 2812 public int getEntityType(String ename) { 2813 Object[] entity = (Object[]) entityInfo.get(ename); 2814 2815 if (entity == null) { 2816 return ENTITY_UNDECLARED; 2817 } else { 2818 return ((Integer) entity[0]).intValue(); 2819 } 2820 } 2821 2822 /** 2823 * Return an external entity's public identifier, if any. 2824 * @param ename The name of the external entity. 2825 * @return The entity's system identifier, or null if the 2826 * entity was not declared, if it is not an 2827 * external entity, or if no public identifier was 2828 * provided. 
2829 * @see #getEntityType 2830 */ 2831 public String getEntityPublicId(String ename) { 2832 Object[] entity = (Object[]) entityInfo.get(ename); 2833 2834 if (entity == null) { 2835 return null; 2836 } else { 2837 return (String) entity[1]; 2838 } 2839 } 2840 2841 /** 2842 * Return an external entity's system identifier. 2843 * @param ename The name of the external entity. 2844 * @return The entity's system identifier, or null if the 2845 * entity was not declared, or if it is not an 2846 * external entity. 2847 * @see #getEntityType 2848 */ 2849 public String getEntitySystemId(String ename) { 2850 Object[] entity = (Object[]) entityInfo.get(ename); 2851 2852 if (entity == null) { 2853 return null; 2854 } else { 2855 return (String) entity[2]; 2856 } 2857 } 2858 2859 /** 2860 * Return the value of an internal entity. 2861 * @param ename The name of the internal entity. 2862 * @return The entity's value, or null if the entity was 2863 * not declared, or if it is not an internal entity. 2864 * @see #getEntityType 2865 */ 2866 public String getEntityValue(String ename) { 2867 Object[] entity = (Object[]) entityInfo.get(ename); 2868 2869 if (entity == null) { 2870 return null; 2871 } else { 2872 return (String) entity[3]; 2873 } 2874 } 2875 2876 /** 2877 * Get the notation name associated with an NDATA entity. 2878 * @param eName The NDATA entity name. 2879 * @return The associated notation name, or null if the 2880 * entity was not declared, or if it is not an 2881 * NDATA entity. 2882 * @see #getEntityType 2883 */ 2884 public String getEntityNotationName(String eName) { 2885 Object[] entity = (Object[]) entityInfo.get(eName); 2886 2887 if (entity == null) { 2888 return null; 2889 } else { 2890 return (String) entity[4]; 2891 } 2892 } 2893 2894 /** 2895 * Register an entity declaration for later retrieval. 
2896 */ 2897 void setInternalEntity(String eName, String value) { 2898 setEntity(eName, ENTITY_INTERNAL, null, null, value, null); 2899 } 2900 2901 /** 2902 * Register an external data entity. 2903 */ 2904 void setExternalDataEntity(String eName, String pubid, String sysid, 2905 String nName) { 2906 setEntity(eName, ENTITY_NDATA, pubid, sysid, null, nName); 2907 } 2908 2909 /** 2910 * Register an external text entity. 2911 */ 2912 void setExternalTextEntity(String eName, String pubid, String sysid) { 2913 setEntity(eName, ENTITY_TEXT, pubid, sysid, null, null); 2914 } 2915 2916 /** 2917 * Register an entity declaration for later retrieval. 2918 */ 2919 void setEntity(String eName, int eClass, String pubid, String sysid, 2920 String value, String nName) { 2921 Object[] entity; 2922 2923 if (entityInfo.get(eName) == null) { 2924 entity = new Object[5]; 2925 entity[0] = Integer.valueOf(eClass); 2926 entity[1] = pubid; 2927 entity[2] = sysid; 2928 entity[3] = value; 2929 entity[4] = nName; 2930 2931 entityInfo.put(eName, entity); 2932 } 2933 } 2934 2935 // 2936 // Notations. 2937 // 2938 2939 /** 2940 * Get declared notations. 2941 * @return An Enumeration of all the notations declared for 2942 * this XML document. The results will be valid only 2943 * after the DTD (if any) has been parsed. 2944 * @see #getNotationPublicId 2945 * @see #getNotationSystemId 2946 */ 2947 public Enumeration declaredNotations() { 2948 return notationInfo.keys(); 2949 } 2950 2951 /** 2952 * Look up the public identifier for a notation. 2953 * You will normally use this method to look up a notation 2954 * that was provided as an attribute value or for an NDATA entity. 2955 * @param nname The name of the notation. 2956 * @return A string containing the public identifier, or null 2957 * if none was provided or if no such notation was 2958 * declared. 
2959 * @see #getNotationSystemId 2960 */ 2961 public String getNotationPublicId(String nname) { 2962 Object[] notation = (Object[]) notationInfo.get(nname); 2963 2964 if (notation == null) { 2965 return null; 2966 } else { 2967 return (String) notation[0]; 2968 } 2969 } 2970 2971 /** 2972 * Look up the system identifier for a notation. 2973 * You will normally use this method to look up a notation 2974 * that was provided as an attribute value or for an NDATA entity. 2975 * @param nname The name of the notation. 2976 * @return A string containing the system identifier, or null 2977 * if no such notation was declared. 2978 * @see #getNotationPublicId 2979 */ 2980 public String getNotationSystemId(String nname) { 2981 Object[] notation = (Object[]) notationInfo.get(nname); 2982 2983 if (notation == null) { 2984 return null; 2985 } else { 2986 return (String) notation[1]; 2987 } 2988 } 2989 2990 /** 2991 * Register a notation declaration for later retrieval. 2992 * Format: 2993 * - public id 2994 * - system id 2995 */ 2996 void setNotation(String nname, String pubid, String sysid) 2997 throws java.lang.Exception { 2998 Object[] notation; 2999 3000 if (notationInfo.get(nname) == null) { 3001 notation = new Object[2]; 3002 notation[0] = pubid; 3003 notation[1] = sysid; 3004 notationInfo.put(nname, notation); 3005 } else { 3006 error("multiple declarations of notation", nname, null); 3007 } 3008 } 3009 3010 // 3011 // Location. 3012 // 3013 3014 /** 3015 * Return the current line number. 3016 * @return The current line number. 3017 */ 3018 public int getLineNumber() { 3019 return line; 3020 } 3021 3022 /** 3023 * Return the current column number. 3024 * @return The current column number. 3025 */ 3026 public int getColumnNumber() { 3027 return column; 3028 } 3029 3030 ////////////////////////////////////////////////////////////////////// 3031 // High-level I/O. 
    //////////////////////////////////////////////////////////////////////

    /**
     * Read a single character from the readBuffer.
     * <p>The readDataChunk() method maintains the buffer.
     * <p>If we hit the end of an entity, try to pop the stack and
     * keep going.
     * <p>(This approach doesn't really enforce XML's rules about
     * entity boundaries, but this is not currently a validating
     * parser).
     * <p>This routine also attempts to keep track of the current
     * position in external entities, but it's not entirely accurate.
     * <p>In the DTD and entity-value contexts, a '%' that is not
     * followed by whitespace is treated as a parameter entity
     * reference: it is handed to parsePEReference() and the next
     * character after that is returned instead.
     * @return The next available input character.
     * @exception java.lang.Exception Whatever readDataChunk(),
     *   popInput() or parsePEReference() throw.
     * @see #unread(char)
     * @see #readDataChunk
     * @see #readBuffer
     * @see #line
     */
    char readCh() throws java.lang.Exception {
        char c;

        // As long as there's nothing in the
        // read buffer, try reading more data
        // (for an external entity) or popping
        // the entity stack (for either).
        while (readBufferPos >= readBufferLength) {
            switch (sourceType) {
            case INPUT_READER:
            case INPUT_EXTERNAL:
            case INPUT_STREAM:
                readDataChunk();

                // Still empty after a refill: this source is used up,
                // so fall back to the enclosing input and, if that is
                // empty as well, try to refill it.
                while (readBufferLength < 1) {
                    popInput();

                    if (readBufferLength < 1) {
                        readDataChunk();
                    }
                }

                break;

            default:
                // Any other source type has nothing to refill from;
                // just return to the enclosing input.
                popInput();
                break;
            }
        }

        c = readBuffer[readBufferPos++];

        // This is a particularly nasty bit
        // of code, that checks for a parameter
        // entity reference but peeks ahead to
        // catch the '%' in parameter entity
        // declarations.
        if (c == '%'
                && (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE)) {
            // Peek at the following character (read, then push back).
            char c2 = readCh();
            unread(c2);

            if (!isWhitespace(c2)) {
                parsePEReference(context == CONTEXT_ENTITYVALUE);
                return readCh();
            }
        }

        // Position bookkeeping: a newline starts a new line at column 0.
        if (c == '\n') {
            line++;
            column = 0;
        } else {
            column++;
        }

        return c;
    }

    /**
     * Push a single character back onto the current input stream.
     * <p>This method usually pushes the character back onto
     * the readBuffer.
     * <p>I don't think that this would ever be called with
     * readBufferPos = 0, because the methods always reads a character
     * before unreading it, but just in case, I've added a boundary
     * condition.
     * @param c The character to push back.
     * @see #readCh
     * @see #unread(char[], int)
     * @see #readBuffer
     */
    void unread(char c) throws java.lang.Exception {
        // Undo the line accounting done by readCh(); the column on the
        // previous line is not tracked, so it is set to -1.
        if (c == '\n') {
            line--;
            column = -1;
        }

        if (readBufferPos > 0) {
            // Normal condition: step back within the current buffer.
            readBuffer[--readBufferPos] = c;
        } else {
            // Boundary condition: no room in front of the read
            // position, so push the character as a new input source.
            pushString(null, Character.toString(c));
        }
    }

    /**
     * Push a char array back onto the current input stream.
     * <p>NOTE: you must <em>never</em> push back characters that you
     * haven't actually read: use pushString() instead.
     * @param ch The characters to push back.
     * @param length The number of characters of <code>ch</code> to
     *   push back, counted from index 0.
     * @see #readCh
     * @see #unread(char)
     * @see #readBuffer
     * @see #pushString
     */
    void unread(char[] ch, int length) throws java.lang.Exception {
        // Undo the line accounting for every newline being returned.
        for (int i = 0; i < length; i++) {
            if (ch[i] == '\n') {
                line--;
                column = -1;
            }
        }

        if (length < readBufferPos) {
            // The characters are assumed to still sit in the buffer
            // just before the read position, so rewinding is enough.
            readBufferPos -= length;
        } else {
            // Not enough room in front of the read position: push the
            // array as a new input source, then mark it as a buffer
            // source (overriding the type set by pushCharArray()).
            pushCharArray(null, ch, 0, length);
            sourceType = INPUT_BUFFER;
        }
    }

    /**
     * Push a new external input source.
     * <p>The source will be either an external text entity, or the DTD
     * external subset.
3164 * <p>TO DO: Right now, this method always attempts to autodetect 3165 * the encoding; in the future, it should allow the caller to 3166 * request an encoding explicitly, and it should also look at the 3167 * headers with an HTTP connection. 3168 * @param ename 3169 * @param publicId 3170 * @param systemId 3171 * @param reader 3172 * @param stream 3173 * @param encoding 3174 * @exception Exception 3175 * @see XmlHandler#resolveEntity 3176 * @see #pushString 3177 * @see #sourceType 3178 * @see #pushInput 3179 * @see #detectEncoding 3180 * @see #sourceType 3181 * @see #readBuffer 3182 */ 3183 void pushURL(String ename, String publicId, String systemId, Reader reader, 3184 InputStream stream, String encoding) throws java.lang.Exception { 3185 URL url; 3186 boolean ignoreEncoding = false; 3187 3188 // Push the existing status. 3189 pushInput(ename); 3190 3191 // Create a new read buffer. 3192 // (Note the four-character margin) 3193 readBuffer = new char[READ_BUFFER_MAX + 4]; 3194 readBufferPos = 0; 3195 readBufferLength = 0; 3196 readBufferOverflow = -1; 3197 is = null; 3198 line = 1; 3199 3200 currentByteCount = 0; 3201 3202 // Flush any remaining data. 3203 dataBufferFlush(); 3204 3205 // Make the URL absolute. 3206 if (systemId != null && externalEntity != null) { 3207 systemId = new URL(externalEntity.getURL(), systemId).toString(); 3208 } else if (baseURI != null) { 3209 try { 3210 systemId = new URL(new URL(baseURI), systemId).toString(); 3211 } catch (Throwable throwable) { 3212 // Ignore this and stick with the old systemId 3213 } 3214 } 3215 3216 // See if the application wants to 3217 // redirect the system ID and/or 3218 // supply its own character stream. 
3219 if (systemId != null && handler != null) { 3220 Object input = handler.resolveEntity(publicId, systemId); 3221 3222 if (input != null) { 3223 if (input instanceof String) { 3224 systemId = (String) input; 3225 } else if (input instanceof InputStream) { 3226 stream = (InputStream) input; 3227 } else if (input instanceof Reader) { 3228 reader = (Reader) input; 3229 } 3230 } 3231 } 3232 3233 // Start the entity. 3234 if (handler != null) { 3235 if (systemId != null) { 3236 handler.startExternalEntity(systemId); 3237 } else { 3238 handler.startExternalEntity("[external stream]"); 3239 } 3240 } 3241 3242 // Figure out what we're reading from. 3243 if (reader != null) { 3244 // There's an explicit character stream. 3245 sourceType = INPUT_READER; 3246 this.reader = reader; 3247 tryEncodingDecl(true); 3248 return; 3249 } else if (stream != null) { 3250 sourceType = INPUT_STREAM; 3251 is = stream; 3252 } else { 3253 // We have to open our own stream 3254 // to the URL. 3255 // Set the new status 3256 sourceType = INPUT_EXTERNAL; 3257 url = new URL(systemId); 3258 3259 externalEntity = url.openConnection(); 3260 externalEntity.connect(); 3261 is = externalEntity.getInputStream(); 3262 } 3263 3264 // If we get to here, there must be 3265 // an InputStream available. 3266 if (!is.markSupported()) { 3267 is = new BufferedInputStream(is); 3268 } 3269 3270 // Attempt to detect the encoding. 3271 if (encoding == null && externalEntity != null) { 3272 encoding = externalEntity.getContentEncoding(); 3273 } 3274 3275 if (encoding != null) { 3276 checkEncoding(encoding, false); 3277 ignoreEncoding = true; 3278 } else { 3279 detectEncoding(); 3280 ignoreEncoding = false; 3281 } 3282 3283 // Read an XML or text declaration. 3284 tryEncodingDecl(ignoreEncoding); 3285 } 3286 3287 /** 3288 * Check for an encoding declaration. 3289 */ 3290 void tryEncodingDecl(boolean ignoreEncoding) throws java.lang.Exception { 3291 // Read the XML/Encoding declaration. 
3292 if (tryRead("<?xml")) { 3293 if (tryWhitespace()) { 3294 if (inputStack.size() > 0) { 3295 parseTextDecl(ignoreEncoding); 3296 } else { 3297 parseXMLDecl(ignoreEncoding); 3298 } 3299 } else { 3300 unread("xml".toCharArray(), 3); 3301 parsePI(); 3302 } 3303 } 3304 } 3305 3306 /** 3307 * Attempt to detect the encoding of an entity. 3308 * <p>The trick here (as suggested in the XML standard) is that 3309 * any entity not in UTF-8, or in UCS-2 with a byte-order mark, 3310 * <b>must</b> begin with an XML declaration or an encoding 3311 * declaration; we simply have to look for "<?XML" in various 3312 * encodings. 3313 * <p>This method has no way to distinguish among 8-bit encodings. 3314 * Instead, it assumes UTF-8, then (possibly) revises its assumption 3315 * later in checkEncoding(). Any ASCII-derived 8-bit encoding 3316 * should work, but most will be rejected later by checkEncoding(). 3317 * <p>I don't currently detect EBCDIC, since I'm concerned that it 3318 * could also be a valid UTF-8 sequence; I'll have to do more checking 3319 * later. 3320 * @see #tryEncoding(byte[], byte, byte, byte, byte) 3321 * @see #tryEncoding(byte[], byte, byte) 3322 * @see #checkEncoding 3323 * @see #read8bitEncodingDeclaration 3324 */ 3325 void detectEncoding() throws java.lang.Exception { 3326 byte[] signature = new byte[4]; 3327 3328 // Read the first four bytes for 3329 // autodetection. 3330 is.mark(4); 3331 int bytesRead = is.read(signature); 3332 if (bytesRead != signature.length) { 3333 throw new IOException("Read only " + bytesRead 3334 + " bytes instead of " + signature.length); 3335 3336 } 3337 is.reset(); 3338 3339 // Look for a known signature. 
3340 if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x00, 3341 (byte) 0x3c)) { 3342 // UCS-4 must begin with "<!XML" 3343 // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234) 3344 encoding = ENCODING_UCS_4_1234; 3345 } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, (byte) 0x00, 3346 (byte) 0x00)) { 3347 // UCS-4 must begin with "<!XML" 3348 // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321) 3349 encoding = ENCODING_UCS_4_4321; 3350 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x3c, 3351 (byte) 0x00)) { 3352 // UCS-4 must begin with "<!XML" 3353 // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143) 3354 encoding = ENCODING_UCS_4_2143; 3355 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, (byte) 0x00, 3356 (byte) 0x00)) { 3357 // UCS-4 must begin with "<!XML" 3358 // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421) 3359 encoding = ENCODING_UCS_4_3412; 3360 } else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) { 3361 // UCS-2 with a byte-order marker. 3362 // 0xfe 0xff: UCS-2, big-endian (12) 3363 encoding = ENCODING_UCS_2_12; 3364 is.read(); 3365 is.read(); 3366 } else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) { 3367 // UCS-2 with a byte-order marker. 
3368 // 0xff 0xfe: UCS-2, little-endian (21) 3369 encoding = ENCODING_UCS_2_21; 3370 is.read(); 3371 is.read(); 3372 } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, (byte) 0x00, 3373 (byte) 0x3f)) { 3374 // UCS-2 without a BOM must begin with "<?XML" 3375 // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark 3376 encoding = ENCODING_UCS_2_12; 3377 error("no byte-order mark for UCS-2 entity", null, null); 3378 } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, (byte) 0x3f, 3379 (byte) 0x00)) { 3380 // UCS-2 without a BOM must begin with "<?XML" 3381 // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark 3382 encoding = ENCODING_UCS_2_21; 3383 error("no byte-order mark for UCS-2 entity", null, null); 3384 } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f, (byte) 0x78, 3385 (byte) 0x6d)) { 3386 // Some kind of 8-bit encoding with "<?XML" 3387 // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING) 3388 encoding = ENCODING_UTF_8; 3389 read8bitEncodingDeclaration(); 3390 } else { 3391 // Some kind of 8-bit encoding without "<?XML" 3392 // (otherwise) UTF-8 without encoding/XML declaration 3393 encoding = ENCODING_UTF_8; 3394 } 3395 } 3396 3397 /** 3398 * Check for a four-byte signature. 3399 * <p>Utility routine for detectEncoding(). 3400 * <p>Always looks for some part of "<?XML" in a specific encoding. 3401 * @param sig The first four bytes read. 3402 * @param b1 The first byte of the signature 3403 * @param b2 The second byte of the signature 3404 * @param b3 The third byte of the signature 3405 * @param b4 The fourth byte of the signature 3406 * @see #detectEncoding 3407 */ 3408 boolean tryEncoding(byte[] sig, byte b1, byte b2, byte b3, byte b4) { 3409 return sig[0] == b1 && sig[1] == b2 && sig[2] == b3 && sig[3] == b4; 3410 } 3411 3412 /** 3413 * Check for a two-byte signature. 3414 * <p>Looks for a UCS-2 byte-order mark. 3415 * <p>Utility routine for detectEncoding(). 
3416 * @param sig The first four bytes read. 3417 * @param b1 The first byte of the signature 3418 * @param b2 The second byte of the signature 3419 * @see #detectEncoding 3420 */ 3421 boolean tryEncoding(byte[] sig, byte b1, byte b2) { 3422 return sig[0] == b1 && sig[1] == b2; 3423 } 3424 3425 /** 3426 * This method pushes a string back onto input. 3427 * <p>It is useful either as the expansion of an internal entity, 3428 * or for backtracking during the parse. 3429 * <p>Call pushCharArray() to do the actual work. 3430 * @param s The string to push back onto input. 3431 * @see #pushCharArray 3432 */ 3433 void pushString(String ename, String s) throws java.lang.Exception { 3434 char[] ch = s.toCharArray(); 3435 pushCharArray(ename, ch, 0, ch.length); 3436 } 3437 3438 /** 3439 * Push a new internal input source. 3440 * <p>This method is useful for expanding an internal entity, 3441 * or for unreading a string of characters. It creates a new 3442 * readBuffer containing the characters in the array, instead 3443 * of characters converted from an input byte stream. 3444 * <p>I've added a couple of optimisations: don't push zero- 3445 * length strings, and just push back a single character 3446 * for 1-character strings; this should save some time and memory. 3447 * @param ch The char array to push. 3448 * @see #pushString 3449 * @see #pushURL 3450 * @see #readBuffer 3451 * @see #sourceType 3452 * @see #pushInput 3453 */ 3454 void pushCharArray(String ename, char[] ch, int start, int length) 3455 throws java.lang.Exception { 3456 // Push the existing status 3457 pushInput(ename); 3458 sourceType = INPUT_INTERNAL; 3459 readBuffer = ch; 3460 readBufferPos = start; 3461 readBufferLength = length; 3462 readBufferOverflow = -1; 3463 } 3464 3465 /** 3466 * Save the current input source onto the stack. 
3467 * <p>This method saves all of the global variables associated with 3468 * the current input source, so that they can be restored when a new 3469 * input source has finished. It also tests for entity recursion. 3470 * <p>The method saves the following global variables onto a stack 3471 * using a fixed-length array: 3472 * <ol> 3473 * <li>sourceType</li> 3474 * <li>externalEntity</li> 3475 * <li>readBuffer</li> 3476 * <li>readBufferPos</li> 3477 * <li>readBufferLength</li> 3478 * <li>line</li> 3479 * <li>encoding</li> 3480 * </ol> 3481 * @param ename The name of the entity (if any) causing the new input. 3482 * @see #popInput 3483 * @see #sourceType 3484 * @see #externalEntity 3485 * @see #readBuffer 3486 * @see #readBufferPos 3487 * @see #readBufferLength 3488 * @see #line 3489 * @see #encoding 3490 */ 3491 void pushInput(String ename) throws java.lang.Exception { 3492 Object[] input = new Object[12]; 3493 3494 // Check for entity recursion. 3495 if (ename != null) { 3496 Enumeration entities = entityStack.elements(); 3497 3498 while (entities.hasMoreElements()) { 3499 String e = (String) entities.nextElement(); 3500 3501 if (e.equals(ename)) { 3502 error("recursive reference to entity", ename, null); 3503 } 3504 } 3505 } 3506 3507 entityStack.push(ename); 3508 3509 // Don't bother if there is no input. 3510 if (sourceType == INPUT_NONE) { 3511 return; 3512 } 3513 3514 // Set up a snapshot of the current 3515 // input source. 3516 input[0] = Integer.valueOf(sourceType); 3517 input[1] = externalEntity; 3518 input[2] = readBuffer; 3519 input[3] = Integer.valueOf(readBufferPos); 3520 input[4] = Integer.valueOf(readBufferLength); 3521 input[5] = Integer.valueOf(line); 3522 input[6] = Integer.valueOf(encoding); 3523 input[7] = Integer.valueOf(readBufferOverflow); 3524 input[8] = is; 3525 input[9] = Integer.valueOf(currentByteCount); 3526 input[10] = Integer.valueOf(column); 3527 input[11] = reader; 3528 3529 // Push it onto the stack. 
3530 inputStack.push(input); 3531 } 3532 3533 /** 3534 * Restore a previous input source. 3535 * <p>This method restores all of the global variables associated with 3536 * the current input source. 3537 * @exception java.io.EOFException 3538 * If there are no more entries on the input stack. 3539 * @see #pushInput 3540 * @see #sourceType 3541 * @see #externalEntity 3542 * @see #readBuffer 3543 * @see #readBufferPos 3544 * @see #readBufferLength 3545 * @see #line 3546 * @see #encoding 3547 */ 3548 void popInput() throws java.lang.Exception { 3549 Object[] input; 3550 3551 switch (sourceType) { 3552 case INPUT_EXTERNAL: 3553 dataBufferFlush(); 3554 3555 if (handler != null && externalEntity != null) { 3556 handler.endExternalEntity(externalEntity.getURL().toString()); 3557 } 3558 3559 break; 3560 3561 case INPUT_STREAM: 3562 dataBufferFlush(); 3563 3564 if (baseURI != null) { 3565 if (handler != null) { 3566 handler.endExternalEntity(baseURI); 3567 } 3568 } 3569 3570 break; 3571 3572 case INPUT_READER: 3573 dataBufferFlush(); 3574 3575 if (baseURI != null) { 3576 if (handler != null) { 3577 handler.endExternalEntity(baseURI); 3578 } 3579 } 3580 3581 break; 3582 } 3583 3584 // Throw an EOFException if there 3585 // is nothing else to pop. 3586 if (inputStack.isEmpty()) { 3587 throw new EOFException("XML parser input stack was empty, " 3588 + "end of file or xml fragment reached. 
" 3589 + "Perhaps there is a missing '>' " 3590 + "or a comment is unterminated by '->'?"); 3591 } else { 3592 input = (Object[]) inputStack.pop(); 3593 entityStack.pop(); 3594 } 3595 3596 sourceType = ((Integer) input[0]).intValue(); 3597 externalEntity = (URLConnection) input[1]; 3598 readBuffer = (char[]) input[2]; 3599 readBufferPos = ((Integer) input[3]).intValue(); 3600 readBufferLength = ((Integer) input[4]).intValue(); 3601 line = ((Integer) input[5]).intValue(); 3602 encoding = ((Integer) input[6]).intValue(); 3603 readBufferOverflow = ((Integer) input[7]).intValue(); 3604 is = (InputStream) input[8]; 3605 currentByteCount = ((Integer) input[9]).intValue(); 3606 column = ((Integer) input[10]).intValue(); 3607 reader = (Reader) input[11]; 3608 } 3609 3610 /** 3611 * Return true if we can read the expected character. 3612 * <p>Note that the character will be removed from the input stream 3613 * on success, but will be put back on failure. Do not attempt to 3614 * read the character again if the method succeeds. 3615 * @param delim The character that should appear next. For a 3616 * insensitive match, you must supply this in upper-case. 3617 * @return true if the character was successfully read, or false if 3618 * it was not. 3619 * @see #tryRead(String) 3620 */ 3621 boolean tryRead(char delim) throws java.lang.Exception { 3622 char c; 3623 3624 // Read the character 3625 c = readCh(); 3626 3627 // Test for a match, and push the character 3628 // back if the match fails. 3629 if (c == delim) { 3630 return true; 3631 } else { 3632 unread(c); 3633 return false; 3634 } 3635 } 3636 3637 /** 3638 * Return true if we can read the expected string. 3639 * <p>This is simply a convenience method. 3640 * <p>Note that the string will be removed from the input stream 3641 * on success, but will be put back on failure. Do not attempt to 3642 * read the string again if the method succeeds. 
3643 * <p>This method will push back a character rather than an 3644 * array whenever possible (probably the majority of cases). 3645 * <p><b>NOTE:</b> This method currently has a hard-coded limit 3646 * of 100 characters for the delimiter. 3647 * @param delim The string that should appear next. 3648 * @return true if the string was successfully read, or false if 3649 * it was not. 3650 * @see #tryRead(char) 3651 */ 3652 boolean tryRead(String delim) throws java.lang.Exception { 3653 char[] ch = delim.toCharArray(); 3654 char c; 3655 3656 // Compare the input, character- 3657 // by character. 3658 for (int i = 0; i < ch.length; i++) { 3659 c = readCh(); 3660 3661 if (c != ch[i]) { 3662 unread(c); 3663 3664 if (i != 0) { 3665 unread(ch, i); 3666 } 3667 3668 return false; 3669 } 3670 } 3671 3672 return true; 3673 } 3674 3675 /** 3676 * Return true if we can read some whitespace. 3677 * <p>This is simply a convenience method. 3678 * <p>This method will push back a character rather than an 3679 * array whenever possible (probably the majority of cases). 3680 * @return true if whitespace was found. 3681 */ 3682 boolean tryWhitespace() throws java.lang.Exception { 3683 char c; 3684 c = readCh(); 3685 3686 if (isWhitespace(c)) { 3687 skipWhitespace(); 3688 return true; 3689 } else { 3690 unread(c); 3691 return false; 3692 } 3693 } 3694 3695 /** 3696 * Read all data until we find the specified string. 3697 * <p>This is especially useful for scanning marked sections. 3698 * <p>This is a a little inefficient right now, since it calls tryRead() 3699 * for every character. 
3700 * @param delim The string delimiter 3701 * @see #tryRead(String) 3702 * @see #readCh 3703 */ 3704 void parseUntil(String delim) throws java.lang.Exception { 3705 char c; 3706 int startLine = line; 3707 3708 try { 3709 while (!tryRead(delim)) { 3710 c = readCh(); 3711 dataBufferAppend(c); 3712 } 3713 } catch (EOFException e) { 3714 error("end of input while looking for delimiter (started on line " 3715 + startLine + ')', null, delim); 3716 } 3717 } 3718 3719 // Modified November 14, 1998 by Steve Neuendorffer 3720 // There was a bug because this was not skipping things that looked 3721 // like parameter entities properly. 3722 // Copied the appropriate code from readCh, excluding the lines referring to 3723 // '%'. 3724 3725 /** 3726 * Skip all data until we find the specified string. 3727 * <p>This is especially useful for scanning comments. 3728 * <p>This is a a little inefficient right now, since it calls tryRead() 3729 * for every character. 3730 * @param delim The string delimiter 3731 * @see #readCh 3732 */ 3733 void skipUntil(String delim) throws java.lang.Exception { 3734 while (!tryRead(delim)) { 3735 char c; 3736 3737 // As long as there's nothing in the 3738 // read buffer, try reading more data 3739 // (for an external entity) or popping 3740 // the entity stack (for either). 3741 while (readBufferPos >= readBufferLength) { 3742 switch (sourceType) { 3743 case INPUT_READER: 3744 case INPUT_EXTERNAL: 3745 case INPUT_STREAM: 3746 readDataChunk(); 3747 3748 while (readBufferLength < 1) { 3749 popInput(); 3750 3751 if (readBufferLength < 1) { 3752 readDataChunk(); 3753 } 3754 } 3755 3756 break; 3757 3758 default: 3759 popInput(); 3760 break; 3761 } 3762 } 3763 3764 c = readBuffer[readBufferPos++]; 3765 3766 if (c == '\n') { 3767 line++; 3768 column = 0; 3769 } else { 3770 column++; 3771 } 3772 } 3773 } 3774 3775 /** 3776 * Read just the encoding declaration (or XML declaration) at the 3777 * start of an external entity. 
3778 * When this method is called, we know that the declaration is 3779 * present (or appears to be). We also know that the entity is 3780 * in some sort of ASCII-derived 8-bit encoding. 3781 * The idea of this is to let us read what the 8-bit encoding is 3782 * before we've committed to converting any more of the file; the 3783 * XML or encoding declaration must be in 7-bit ASCII, so we're 3784 * safe as long as we don't go past it. 3785 */ 3786 void read8bitEncodingDeclaration() throws java.lang.Exception { 3787 int ch; 3788 readBufferPos = readBufferLength = 0; 3789 3790 while (true) { 3791 ch = is.read(); 3792 readBuffer[readBufferLength++] = (char) ch; 3793 3794 switch (ch) { 3795 case '>': 3796 return; 3797 3798 case -1: 3799 error("end of file before end of XML or encoding declaration.", 3800 null, "?>"); 3801 return; 3802 } 3803 3804 if (readBuffer.length == readBufferLength) { 3805 error("unfinished XML or encoding declaration", null, null); 3806 } 3807 } 3808 } 3809 3810 ////////////////////////////////////////////////////////////////////// 3811 // Low-level I/O. 3812 ////////////////////////////////////////////////////////////////////// 3813 3814 /** 3815 * Read a chunk of data from an external input source. 3816 * <p>This is simply a front-end that fills the rawReadBuffer 3817 * with bytes, then calls the appropriate encoding handler. 3818 * @see #encoding 3819 * @see #rawReadBuffer 3820 * @see #readBuffer 3821 * @see #filterCR 3822 * @see #copyUtf8ReadBuffer 3823 * @see #copyIso8859_1ReadBuffer 3824 */ 3825 void readDataChunk() throws java.lang.Exception { 3826 int count; 3827 3828 // See if we have any overflow. 3829 if (readBufferOverflow > -1) { 3830 readBuffer[0] = (char) readBufferOverflow; 3831 readBufferOverflow = -1; 3832 readBufferPos = 1; 3833 sawCR = true; 3834 } else { 3835 readBufferPos = 0; 3836 sawCR = false; 3837 } 3838 3839 // Special situation -- we're taking 3840 // input from a character stream. 
3841 if (sourceType == INPUT_READER) { 3842 count = reader.read(readBuffer, readBufferPos, READ_BUFFER_MAX - 1); 3843 3844 if (count < 0) { 3845 readBufferLength = -1; 3846 } else { 3847 readBufferLength = readBufferPos + count; 3848 filterCR(); 3849 sawCR = false; 3850 } 3851 3852 return; 3853 } 3854 3855 // Read as many bytes as possible 3856 // into the read buffer. 3857 count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX); 3858 3859 // Dispatch to an encoding-specific 3860 // reader method to populate the 3861 // readBuffer. 3862 switch (encoding) { 3863 case ENCODING_UTF_8: 3864 copyUtf8ReadBuffer(count); 3865 break; 3866 3867 case ENCODING_ISO_8859_1: 3868 copyIso8859_1ReadBuffer(count); 3869 break; 3870 3871 case ENCODING_UCS_2_12: 3872 copyUcs2ReadBuffer(count, 8, 0); 3873 break; 3874 3875 case ENCODING_UCS_2_21: 3876 copyUcs2ReadBuffer(count, 0, 8); 3877 break; 3878 3879 case ENCODING_UCS_4_1234: 3880 copyUcs4ReadBuffer(count, 24, 16, 8, 0); 3881 break; 3882 3883 case ENCODING_UCS_4_4321: 3884 copyUcs4ReadBuffer(count, 0, 8, 16, 24); 3885 break; 3886 3887 case ENCODING_UCS_4_2143: 3888 copyUcs4ReadBuffer(count, 16, 24, 0, 8); 3889 break; 3890 3891 case ENCODING_UCS_4_3412: 3892 copyUcs4ReadBuffer(count, 8, 0, 24, 16); 3893 break; 3894 } 3895 3896 // Filter out all carriage returns 3897 // if we've seen any. 3898 if (sawCR) { 3899 filterCR(); 3900 sawCR = false; 3901 } 3902 3903 // Reset the position. 3904 readBufferPos = 0; 3905 currentByteCount += count; 3906 } 3907 3908 /** 3909 * Filter carriage returns in the read buffer. 3910 * <p>CRLF becomes LF; CR becomes LF. 
3911 * @see #readDataChunk 3912 * @see #readBuffer 3913 * @see #readBufferOverflow 3914 */ 3915 void filterCR() { 3916 int i; 3917 int j; 3918 3919 readBufferOverflow = -1; 3920 3921 loop: for (i = 0, j = 0; j < readBufferLength; i++, j++) { 3922 switch (readBuffer[j]) { 3923 case '\r': 3924 3925 if (j == readBufferLength - 1) { 3926 readBufferOverflow = '\r'; 3927 readBufferLength--; 3928 break loop; 3929 } else if (readBuffer[j + 1] == '\n') { 3930 j++; 3931 } 3932 3933 readBuffer[i] = '\n'; 3934 break; 3935 3936 case '\n': 3937 default: 3938 readBuffer[i] = readBuffer[j]; 3939 break; 3940 } 3941 } 3942 3943 readBufferLength = i; 3944 } 3945 3946 /** 3947 * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters. 3948 * <p>When readDataChunk() calls this method, the raw bytes are in 3949 * rawReadBuffer, and the final characters will appear in 3950 * readBuffer. 3951 * <p>The tricky part of this is dealing with UTF-8 multi-byte 3952 * sequences, but it doesn't seem to slow things down too much. 3953 * @param count The number of bytes to convert. 3954 * @see #readDataChunk 3955 * @see #rawReadBuffer 3956 * @see #readBuffer 3957 * @see #getNextUtf8Byte 3958 */ 3959 void copyUtf8ReadBuffer(int count) throws java.lang.Exception { 3960 int i = 0; 3961 int j = readBufferPos; 3962 int b1; 3963 3964 while (i < count) { 3965 b1 = rawReadBuffer[i++]; 3966 3967 // Determine whether we are dealing 3968 // with a one-, two-, three-, or four- 3969 // byte sequence. 
3970 if ((b1 & 0x80) == 0) { 3971 // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx 3972 readBuffer[j++] = (char) b1; 3973 } else if ((b1 & 0xe0) == 0xc0) { 3974 // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx 3975 readBuffer[j++] = (char) ((b1 & 0x1f) << 6 3976 | getNextUtf8Byte(i++, count)); 3977 } else if ((b1 & 0xf0) == 0xe0) { 3978 // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx 3979 readBuffer[j++] = (char) ((b1 & 0x0f) << 12 3980 | getNextUtf8Byte(i++, count) << 6 3981 | getNextUtf8Byte(i++, count)); 3982 } else if ((b1 & 0xf8) == 0xf0) { 3983 // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx 3984 // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 3985 // (uuuuu = wwww + 1) 3986 int b2 = getNextUtf8Byte(i++, count); 3987 int b3 = getNextUtf8Byte(i++, count); 3988 int b4 = getNextUtf8Byte(i++, count); 3989 readBuffer[j++] = (char) (0xd800 3990 | ((b1 & 0x07) << 2 | ((b2 & 0x30) >> 4) - 1) << 6 3991 | (b2 & 0x0f) << 2 | (b3 & 0x30) >> 4); 3992 readBuffer[j++] = (char) (0xdc | (b3 & 0x0f) << 6 | b4); 3993 3994 // TODO: test that surrogate value is legal. 3995 } else { 3996 // Otherwise, the 8th bit may not be set in UTF-8 3997 encodingError("bad start for UTF-8 multi-byte sequence", b1, i); 3998 } 3999 4000 if (readBuffer[j - 1] == '\r') { 4001 sawCR = true; 4002 } 4003 } 4004 4005 // How many characters have we read? 4006 readBufferLength = j; 4007 } 4008 4009 /** 4010 * Return the next byte value in a UTF-8 sequence. 4011 * If it is not possible to get a byte from the current 4012 * entity, throw an exception. 4013 * @param pos The current position in the rawReadBuffer. 4014 * @param count The number of bytes in the rawReadBuffer 4015 * @return The significant six bits of a non-initial byte in 4016 * a UTF-8 sequence. 4017 * @exception EOFException If the sequence is incomplete. 
4018 */ 4019 int getNextUtf8Byte(int pos, int count) throws java.lang.Exception { 4020 int val; 4021 4022 // Take a character from the buffer 4023 // or from the actual input stream. 4024 if (pos < count) { 4025 val = rawReadBuffer[pos]; 4026 } else { 4027 val = is.read(); 4028 4029 if (val == -1) { 4030 encodingError("unfinished multi-byte UTF-8 sequence at EOF", -1, 4031 pos); 4032 } 4033 } 4034 4035 // Check for the correct bits at the 4036 // start. 4037 if ((val & 0xc0) != 0x80) { 4038 encodingError("bad continuation of multi-byte UTF-8 sequence", val, 4039 pos + 1); 4040 } 4041 4042 // Return the significant bits. 4043 return val & 0x3f; 4044 } 4045 4046 /** 4047 * Convert a buffer of ISO-8859-1-encoded bytes into UTF-16 characters. 4048 * <p>When readDataChunk() calls this method, the raw bytes are in 4049 * rawReadBuffer, and the final characters will appear in 4050 * readBuffer. 4051 * <p>This is a direct conversion, with no tricks. 4052 * @param count The number of bytes to convert. 4053 * @see #readDataChunk 4054 * @see #rawReadBuffer 4055 * @see #readBuffer 4056 */ 4057 void copyIso8859_1ReadBuffer(int count) { 4058 int i; 4059 int j; 4060 4061 for (i = 0, j = readBufferPos; i < count; i++, j++) { 4062 readBuffer[j] = (char) (rawReadBuffer[i] & 0xff); 4063 4064 if (readBuffer[j] == '\r') { 4065 sawCR = true; 4066 } 4067 } 4068 4069 readBufferLength = j; 4070 } 4071 4072 /** 4073 * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters. 4074 * <p>When readDataChunk() calls this method, the raw bytes are in 4075 * rawReadBuffer, and the final characters will appear in 4076 * readBuffer. 4077 * @param count The number of bytes to convert. 4078 * @param shift1 The number of bits to shift byte 1. 
4079 * @param shift2 The number of bits to shift byte 2 4080 * @see #readDataChunk 4081 * @see #rawReadBuffer 4082 * @see #readBuffer 4083 */ 4084 void copyUcs2ReadBuffer(int count, int shift1, int shift2) 4085 throws java.lang.Exception { 4086 int j = readBufferPos; 4087 4088 if (count > 0 && count % 2 != 0) { 4089 encodingError("odd number of bytes in UCS-2 encoding", -1, count); 4090 } 4091 4092 for (int i = 0; i < count; i += 2) { 4093 readBuffer[j++] = (char) ((rawReadBuffer[i] & 0xff) << shift1 4094 | (rawReadBuffer[i + 1] & 0xff) << shift2); 4095 4096 if (readBuffer[j - 1] == '\r') { 4097 sawCR = true; 4098 } 4099 } 4100 4101 readBufferLength = j; 4102 } 4103 4104 /** 4105 * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters. 4106 * <p>When readDataChunk() calls this method, the raw bytes are in 4107 * rawReadBuffer, and the final characters will appear in 4108 * readBuffer. 4109 * <p>Java has 16-bit chars, but this routine will attempt to use 4110 * surrogates to encoding values between 0x00010000 and 0x000fffff. 4111 * @param count The number of bytes to convert. 4112 * @param shift1 The number of bits to shift byte 1. 
4113 * @param shift2 The number of bits to shift byte 2 4114 * @param shift3 The number of bits to shift byte 2 4115 * @param shift4 The number of bits to shift byte 2 4116 * @see #readDataChunk 4117 * @see #rawReadBuffer 4118 * @see #readBuffer 4119 */ 4120 void copyUcs4ReadBuffer(int count, int shift1, int shift2, int shift3, 4121 int shift4) throws java.lang.Exception { 4122 int j = readBufferPos; 4123 int value; 4124 4125 if (count > 0 && count % 4 != 0) { 4126 encodingError( 4127 "number of bytes in UCS-4 encoding not divisible by 4", -1, 4128 count); 4129 } 4130 4131 for (int i = 0; i < count; i += 4) { 4132 value = (rawReadBuffer[i] & 0xff) << shift1 4133 | (rawReadBuffer[i + 1] & 0xff) << shift2 4134 | (rawReadBuffer[i + 2] & 0xff) << shift3 4135 | (rawReadBuffer[i + 3] & 0xff) << shift4; 4136 4137 if (value < 0x0000ffff) { 4138 readBuffer[j++] = (char) value; 4139 4140 if (value == '\r') { 4141 sawCR = true; 4142 } 4143 } else if (value < 0x000fffff) { 4144 readBuffer[j++] = (char) (0xd8 | (value & 0x000ffc00) >> 10); 4145 readBuffer[j++] = (char) (0xdc | value & 0x0003ff); 4146 } else { 4147 encodingError("value cannot be represented in UTF-16", value, 4148 i); 4149 } 4150 } 4151 4152 readBufferLength = j; 4153 } 4154 4155 /** 4156 * Report a character encoding error. 4157 */ 4158 void encodingError(String message, int value, int offset) 4159 throws java.lang.Exception { 4160 String uri; 4161 4162 if (value >= 0) { 4163 message = message + " (byte value: 0x" + Integer.toHexString(value) 4164 + ')'; 4165 } 4166 4167 if (externalEntity != null) { 4168 uri = externalEntity.getURL().toString(); 4169 } else { 4170 uri = baseURI; 4171 } 4172 4173 handler.error(message, uri, -1, offset + currentByteCount); 4174 } 4175 4176 ////////////////////////////////////////////////////////////////////// 4177 // Local Variables. 4178 ////////////////////////////////////////////////////////////////////// 4179 4180 /** 4181 * Re-initialize the variables for each parse. 
*/
    void initializeVariables() {
        // Position tracking starts on the first line.
        line = 1;
        column = 0;

        // No input source is active yet, and nothing is stacked.
        sourceType = INPUT_NONE;
        externalEntity = null;
        inputStack = new Stack();
        entityStack = new Stack();
        rawReadBuffer = new byte[READ_BUFFER_MAX];
        readBufferOverflow = -1;

        // Fresh, empty buffers for character data and for names.
        dataBuffer = new char[DATA_BUFFER_INITIAL];
        dataBufferPos = 0;
        nameBuffer = new char[NAME_BUFFER_INITIAL];
        nameBufferPos = 0;

        // Empty DTD tables.
        elementInfo = new Hashtable();
        entityInfo = new Hashtable();
        notationInfo = new Hashtable();

        // No element is open and no context applies.
        currentElement = null;
        currentElementContent = CONTENT_UNDECLARED;
        context = CONTEXT_NONE;

        // Start-tag attribute scratch space and the name symbol table.
        tagAttributes = new String[100];
        tagAttributePos = 0;
        symbolTable = new Object[SYMBOL_TABLE_LENGTH];
    }

    /**
     * Clean up after the parse to allow some garbage collection.
     * Leave around anything that might be useful for queries.
     * <p>The position counters are set to -1, the data and name buffers
     * and both stacks are released, and the element and input-source
     * state is reset.  The DTD tables are left untouched.
     */
    void cleanupVariables() {
        // Mark the position as unknown.
        line = -1;
        column = -1;

        // Release the working buffers and stacks.
        dataBuffer = null;
        nameBuffer = null;
        inputStack = null;
        entityStack = null;

        // Reset element and input-source state.
        currentElement = null;
        currentElementContent = CONTENT_UNDECLARED;
        sourceType = INPUT_NONE;
        externalEntity = null;
    }

    // The handler that receives the parse events.
    XmlHandler handler;

    //
    // I/O information.
4245 // 4246 private Reader reader; // current reader 4247 4248 private InputStream is; // current input stream 4249 4250 private int line; // current line number 4251 4252 private int column; // current column number 4253 4254 private int sourceType; // type of input source 4255 4256 private Stack inputStack; // stack of input sources 4257 4258 private URLConnection externalEntity; // current external entity 4259 4260 private int encoding; // current character encoding. 4261 4262 private int currentByteCount; // how many bytes read from current source. 4263 4264 // 4265 // Maintain a count of errors. 4266 // 4267 //private int errorCount; 4268 4269 // 4270 // Buffers for decoded but unparsed character input. 4271 // 4272 private final static int READ_BUFFER_MAX = 16384; 4273 4274 private char[] readBuffer; 4275 4276 private int readBufferPos; 4277 4278 private int readBufferLength; 4279 4280 private int readBufferOverflow; // overflow character from last data chunk. 4281 4282 // 4283 // Stack of entity names, to help detect recursion. 4284 // 4285 private Stack entityStack; 4286 4287 // 4288 // Buffer for undecoded raw byte input. 4289 // 4290 private byte[] rawReadBuffer; 4291 4292 // 4293 // Buffer for parsed character data. 4294 // 4295 private static int DATA_BUFFER_INITIAL = 4096; 4296 4297 private char[] dataBuffer; 4298 4299 private int dataBufferPos; 4300 4301 // 4302 // Buffer for parsed names. 4303 // 4304 private static int NAME_BUFFER_INITIAL = 1024; 4305 4306 private char[] nameBuffer; 4307 4308 private int nameBufferPos; 4309 4310 // 4311 // Hashtables for DTD information on elements, entities, and notations. 4312 // 4313 private Hashtable elementInfo; 4314 4315 private Hashtable entityInfo; 4316 4317 private Hashtable notationInfo; 4318 4319 // 4320 // Element type currently in force. 4321 // 4322 private String currentElement; 4323 4324 private int currentElementContent; 4325 4326 // 4327 // Base external identifiers for resolution. 
4328 // 4329 private String basePublicId; 4330 4331 private String baseURI; 4332 4333 private Reader baseReader; 4334 4335 private InputStream baseInputStream; 4336 4337 // 4338 // Are we in a context where PEs are allowed? 4339 // 4340 private int context; 4341 4342 // 4343 // Symbol table, for internalising names. 4344 // 4345 private Object[] symbolTable; 4346 4347 private final static int SYMBOL_TABLE_LENGTH = 1087; 4348 4349 // 4350 // Hash table of attributes found in current start tag. 4351 // 4352 private String[] tagAttributes; 4353 4354 private int tagAttributePos; 4355 4356 // 4357 // Utility flag: have we noticed a CR while reading the last 4358 // data chunk? If so, we will have to go back and normalise 4359 // CR/LF. 4360 // 4361 private boolean sawCR; 4362}