Source code

001// XmlParser.java: the main parser class.
002// NO WARRANTY! See README, and copyright below.
003// $Id$
004package com.microstar.xml;
005
006import java.io.BufferedInputStream;
007import java.io.EOFException;
008import java.io.IOException;
009import java.io.InputStream;
010import java.io.Reader;
011import java.net.URL;
012import java.net.URLConnection;
013import java.util.Enumeration;
014import java.util.Hashtable;
015import java.util.Locale;
016import java.util.Stack;
017
018/**
019 * Parse XML documents and return parse events through call-backs.
020 * <p>You need to define a class implementing the <code>XmlHandler</code>
021 * interface: an object belonging to this class will receive the
022 * callbacks for the events.  (As an alternative to implementing
023 * the full XmlHandler interface, you can simply extend the
024 * <code>HandlerBase</code> convenience class.)
025 * <p>Usage (assuming that <code>MyHandler</code> is your implementation
026 * of the <code>XmlHandler</code> interface):
027 * <pre>
028 * XmlHandler handler = new MyHandler();
029 * XmlParser parser = new XmlParser();
030 * parser.setHandler(handler);
031 * try {
032 *   parser.parse("http://www.host.com/doc.xml", null, null);
033 * } catch (Exception e) {
034 *   [do something interesting]
035 * }
036 * </pre>
037 * <p>Alternatively, you can use the standard SAX interfaces
038 * with the <code>SAXDriver</code> class as your entry point.
039 * @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
040 * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
041 * @version 1.1
042 * @since Ptolemy II 0.2
043 * @see XmlHandler
044 * @see HandlerBase
045 */
046public class XmlParser {
047    //
048    // Use special cheats that speed up the code (currently about 50%),
049    // but may cause problems with future maintenance and add to the
050    // class file size (about 500 bytes).
051    //
052    private final static boolean USE_CHEATS = true;
053
054    //////////////////////////////////////////////////////////////////////
055    // Constructors.
056    ////////////////////////////////////////////////////////////////////////
057
058    /**
059     * Construct a new parser with no associated handler.
060     * @see #setHandler
061     * @see #parse
062     */
063    public XmlParser() {
064    }
065
066    /**
067     * Set the handler that will receive parsing events.
068     * @param handler The handler to receive callback events.
069     * @see #parse
070     * @see XmlHandler
071     */
072    public void setHandler(XmlHandler handler) {
073        this.handler = handler;
074    }
075
076    /**
077     * Parse an XML document from a URI.
078     * <p>You may parse a document more than once, but only one thread
079     * may call this method for an object at one time.
080     * @param systemId The URI of the document.
081     * @param publicId The public identifier of the document, or null.
082     * @param encoding The suggested encoding, or null if unknown.
083     * @exception java.lang.Exception Any exception thrown by your
084     *            own handlers, or any derivation of java.io.IOException
085     *            thrown by the parser itself.
086     */
087    public void parse(String systemId, String publicId, String encoding)
088            throws java.lang.Exception {
089        doParse(systemId, publicId, null, null, encoding);
090    }
091
092    /**
093     * Parse an XML document from a byte stream.
094     * <p>The URI that you supply will become the base URI for
095     * resolving relative links, but &AElig;lfred will actually read
096     * the document from the supplied input stream.
097     * <p>You may parse a document more than once, but only one thread
098     * may call this method for an object at one time.
099     * @param systemId The base URI of the document, or null if not
100     *                 known.
101     * @param publicId The public identifier of the document, or null
102     *                 if not known.
103     * @param stream A byte input stream.
104     * @param encoding The suggested encoding, or null if unknown.
105     * @exception java.lang.Exception Any exception thrown by your
106     *            own handlers, or any derivation of java.io.IOException
107     *            thrown by the parser itself.
108     */
109    public void parse(String systemId, String publicId, InputStream stream,
110            String encoding) throws java.lang.Exception {
111        doParse(systemId, publicId, null, stream, encoding);
112    }
113
114    /**
115     * Parse an XML document from a character stream.
116     * <p>The URI that you supply will become the base URI for
117     * resolving relative links, but &AElig;lfred will actually read
118     * the document from the supplied input stream.
119     * <p>You may parse a document more than once, but only one thread
120     * may call this method for an object at one time.
121     * @param systemId The base URI of the document, or null if not
122     *                 known.
123     * @param publicId The public identifier of the document, or null
124     *                 if not known.
125     * @param reader A character stream.
126     * @exception java.lang.Exception Any exception thrown by your
127     *            own handlers, or any derivation of java.io.IOException
128     *            thrown by the parser itself.
129     */
130    public void parse(String systemId, String publicId, Reader reader)
131            throws java.lang.Exception {
132        doParse(systemId, publicId, reader, null, null);
133    }
134
135    private synchronized void doParse(String systemId, String publicId,
136            Reader reader, InputStream stream, String encoding)
137            throws java.lang.Exception {
138        basePublicId = publicId;
139        baseURI = systemId;
140        baseReader = reader;
141        baseInputStream = stream;
142
143        initializeVariables();
144
145        // Set the default entities here.
146        setInternalEntity(intern("amp"), "&#38;");
147        setInternalEntity(intern("lt"), "&#60;");
148        setInternalEntity(intern("gt"), "&#62;");
149        setInternalEntity(intern("apos"), "&#39;");
150        setInternalEntity(intern("quot"), "&#34;");
151
152        if (handler != null) {
153            handler.startDocument();
154        }
155
156        pushURL("[document]", basePublicId, baseURI, baseReader,
157                baseInputStream, encoding);
158
159        parseDocument();
160
161        if (handler != null) {
162            handler.endDocument();
163        }
164
165        cleanupVariables();
166    }
167
168    ////////////////////////////////////////////////////////////////////////
169    // Constants.
170    ////////////////////////////////////////////////////////////////////////
171    //
172    // Constants for element content type.
173    //
174
175    /**
176     * Constant: an element has not been declared.
177     * @see #getElementContentType
178     */
179    public final static int CONTENT_UNDECLARED = 0;
180
181    /**
182     * Constant: the element has a content model of ANY.
183     * @see #getElementContentType
184     */
185    public final static int CONTENT_ANY = 1;
186
187    /**
188     * Constant: the element has declared content of EMPTY.
189     * @see #getElementContentType
190     */
191    public final static int CONTENT_EMPTY = 2;
192
193    /**
194     * Constant: the element has mixed content.
195     * @see #getElementContentType
196     */
197    public final static int CONTENT_MIXED = 3;
198
199    /**
200     * Constant: the element has element content.
201     * @see #getElementContentType
202     */
203    public final static int CONTENT_ELEMENTS = 4;
204
205    //
206    // Constants for the entity type.
207    //
208
209    /**
210     * Constant: the entity has not been declared.
211     * @see #getEntityType
212     */
213    public final static int ENTITY_UNDECLARED = 0;
214
215    /**
216     * Constant: the entity is internal.
217     * @see #getEntityType
218     */
219    public final static int ENTITY_INTERNAL = 1;
220
221    /**
222     * Constant: the entity is external, non-XML data.
223     * @see #getEntityType
224     */
225    public final static int ENTITY_NDATA = 2;
226
227    /**
228     * Constant: the entity is external XML data.
229     * @see #getEntityType
230     */
231    public final static int ENTITY_TEXT = 3;
232
233    //
234    // Constants for attribute type.
235    //
236
237    /**
238     * Constant: the attribute has not been declared for this element type.
239     * @see #getAttributeType
240     */
241    public final static int ATTRIBUTE_UNDECLARED = 0;
242
243    /**
244     * Constant: the attribute value is a string value.
245     * @see #getAttributeType
246     */
247    public final static int ATTRIBUTE_CDATA = 1;
248
249    /**
250     * Constant: the attribute value is a unique identifier.
251     * @see #getAttributeType
252     */
253    public final static int ATTRIBUTE_ID = 2;
254
255    /**
256     * Constant: the attribute value is a reference to a unique identifier.
257     * @see #getAttributeType
258     */
259    public final static int ATTRIBUTE_IDREF = 3;
260
261    /**
262     * Constant: the attribute value is a list of ID references.
263     * @see #getAttributeType
264     */
265    public final static int ATTRIBUTE_IDREFS = 4;
266
267    /**
268     * Constant: the attribute value is the name of an entity.
269     * @see #getAttributeType
270     */
271    public final static int ATTRIBUTE_ENTITY = 5;
272
273    /**
274     * Constant: the attribute value is a list of entity names.
275     * @see #getAttributeType
276     */
277    public final static int ATTRIBUTE_ENTITIES = 6;
278
279    /**
280     * Constant: the attribute value is a name token.
281     * @see #getAttributeType
282     */
283    public final static int ATTRIBUTE_NMTOKEN = 7;
284
285    /**
286     * Constant: the attribute value is a list of name tokens.
287     * @see #getAttributeType
288     */
289    public final static int ATTRIBUTE_NMTOKENS = 8;
290
291    /**
292     * Constant: the attribute value is a token from an enumeration.
293     * @see #getAttributeType
294     */
295    public final static int ATTRIBUTE_ENUMERATED = 9;
296
297    /**
298     * Constant: the attribute is the name of a notation.
299     * @see #getAttributeType
300     */
301    public final static int ATTRIBUTE_NOTATION = 10;
302
303    //
304    // When the class is loaded, populate the hash table of
305    // attribute types.
306    //
307
308    /**
309     * Hash table of attribute types.
310     */
311    private static Hashtable attributeTypeHash;
312
313    static {
314        attributeTypeHash = new Hashtable();
315        attributeTypeHash.put("CDATA", Integer.valueOf(ATTRIBUTE_CDATA));
316        attributeTypeHash.put("ID", Integer.valueOf(ATTRIBUTE_ID));
317        attributeTypeHash.put("IDREF", Integer.valueOf(ATTRIBUTE_IDREF));
318        attributeTypeHash.put("IDREFS", Integer.valueOf(ATTRIBUTE_IDREFS));
319        attributeTypeHash.put("ENTITY", Integer.valueOf(ATTRIBUTE_ENTITY));
320        attributeTypeHash.put("ENTITIES", Integer.valueOf(ATTRIBUTE_ENTITIES));
321        attributeTypeHash.put("NMTOKEN", Integer.valueOf(ATTRIBUTE_NMTOKEN));
322        attributeTypeHash.put("NMTOKENS", Integer.valueOf(ATTRIBUTE_NMTOKENS));
323        attributeTypeHash.put("NOTATION", Integer.valueOf(ATTRIBUTE_NOTATION));
324    }
325
326    //
327    // Constants for supported encodings.
328    //
329    private final static int ENCODING_UTF_8 = 1;
330
331    private final static int ENCODING_ISO_8859_1 = 2;
332
333    private final static int ENCODING_UCS_2_12 = 3;
334
335    private final static int ENCODING_UCS_2_21 = 4;
336
337    private final static int ENCODING_UCS_4_1234 = 5;
338
339    private final static int ENCODING_UCS_4_4321 = 6;
340
341    private final static int ENCODING_UCS_4_2143 = 7;
342
343    private final static int ENCODING_UCS_4_3412 = 8;
344
345    //
346    // Constants for attribute default value.
347    //
348
349    /**
350     * Constant: the attribute is not declared.
351     * @see #getAttributeDefaultValueType
352     */
353    public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0;
354
355    /**
356     * Constant: the attribute has a literal default value specified.
357     * @see #getAttributeDefaultValueType
358     * @see #getAttributeDefaultValue
359     */
360    public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1;
361
362    /**
363     * Constant: the attribute was declared #IMPLIED.
364     * @see #getAttributeDefaultValueType
365     */
366    public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2;
367
368    /**
369     * Constant: the attribute was declared #REQUIRED.
370     * @see #getAttributeDefaultValueType
371     */
372    public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3;
373
374    /**
375     * Constant: the attribute was declared #FIXED.
376     * @see #getAttributeDefaultValueType
377     * @see #getAttributeDefaultValue
378     */
379    public final static int ATTRIBUTE_DEFAULT_FIXED = 4;
380
381    //
382    // Constants for input.
383    //
384    private final static int INPUT_NONE = 0;
385
386    private final static int INPUT_INTERNAL = 1;
387
388    private final static int INPUT_EXTERNAL = 2;
389
390    private final static int INPUT_STREAM = 3;
391
392    private final static int INPUT_BUFFER = 4;
393
394    private final static int INPUT_READER = 5;
395
396    //
397    // Flags for reading literals.
398    //
399    private final static int LIT_CHAR_REF = 1;
400
401    private final static int LIT_ENTITY_REF = 2;
402
403    private final static int LIT_PE_REF = 4;
404
405    private final static int LIT_NORMALIZE = 8;
406
407    //
408    // Flags for parsing context.
409    //
410    private final static int CONTEXT_NONE = 0;
411
412    private final static int CONTEXT_DTD = 1;
413
414    private final static int CONTEXT_ENTITYVALUE = 2;
415
416    private final static int CONTEXT_ATTRIBUTEVALUE = 3;
417
418    //////////////////////////////////////////////////////////////////////
419    // Error reporting.
420    //////////////////////////////////////////////////////////////////////
421
422    /**
423     * Report an error.
424     * @param message The error message.
425     * @param textFound The text that caused the error (or null).
426     * @see XmlHandler#error
427     * @see #line
428     */
429    void error(String message, String textFound, String textExpected)
430            throws java.lang.Exception {
431        //errorCount++;
432
433        if (textFound != null) {
434            message = message + " (found \"" + textFound + "\")";
435        }
436
437        if (textExpected != null) {
438            message = message + " (expected \"" + textExpected + "\")";
439        }
440
441        if (handler != null) {
442            String uri = null;
443
444            if (externalEntity != null) {
445                uri = externalEntity.getURL().toString();
446            }
447
448            handler.error(message, uri, line, column);
449        }
450    }
451
452    /**
453     * Report a serious error.
454     * @param message The error message.
455     * @param textFound The text that caused the error (or null).
456     */
457    void error(String message, char textFound, String textExpected)
458            throws java.lang.Exception {
459        error(message, Character.toString(textFound), textExpected);
460    }
461
462    //////////////////////////////////////////////////////////////////////
463    // Major syntactic productions.
464    //////////////////////////////////////////////////////////////////////
465
466    /**
467     * Parse an XML document.
468     * <pre>
469     * [1] document ::= prolog element Misc*
470     * </pre>
471     * <p>This is the top-level parsing function for a single XML
472     * document.  As a minimum, a well-formed document must have
473     * a document element, and a valid document must have a prolog
474     * as well.
475     */
476    void parseDocument() throws java.lang.Exception {
477        char c;
478
479        parseProlog();
480        require('<');
481        parseElement();
482
483        try {
484            parseMisc(); //skip all white, PIs, and comments
485            c = readCh(); //if this doesn't throw an exception...
486            error("unexpected characters after document end", c, null);
487        } catch (EOFException e) {
488            return;
489        }
490    }
491
492    /**
493     * Skip a comment.
494     * <pre>
495     * [18] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "--&gt;"
496     * </pre>
497     * <p>(The <code>&lt;!--</code> has already been read.)
498     */
499    void parseComment() throws java.lang.Exception {
500        skipUntil("-->");
501    }
502
503    /**
504     * Parse a processing instruction and do a call-back.
505     * <pre>
506     * [19] PI ::= '&lt;?' Name (S (Char* - (Char* '?&gt;' Char*)))? '?&gt;'
507     * </pre>
508     * <p>(The <code>&lt;?</code> has already been read.)
509     * <p>An XML processing instruction <em>must</em> begin with
510     * a Name, which is the instruction's target.
511     */
512    void parsePI() throws java.lang.Exception {
513        String name;
514
515        name = readNmtoken(true);
516
517        if (!tryRead("?>")) {
518            requireWhitespace();
519            parseUntil("?>");
520        }
521
522        if (handler != null) {
523            handler.processingInstruction(name, dataBufferToString());
524        }
525    }
526
527    /**
528     * Parse a CDATA marked section.
529     * <pre>
530     * [20] CDSect ::= CDStart CData CDEnd
531     * [21] CDStart ::= '&lt;![CDATA['
532     * [22] CData ::= (Char* - (Char* ']]&gt;' Char*))
533     * [23] CDEnd ::= ']]&gt;'
534     * </pre>
535     * <p>(The '&lt;![CDATA[' has already been read.)
536     * <p>Note that this just appends characters to the dataBuffer,
537     * without actually generating an event.
538     */
539    void parseCDSect() throws java.lang.Exception {
540        parseUntil("]]>");
541    }
542
543    /**
544     * Parse the prolog of an XML document.
545     * <pre>
546     * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
547     * </pre>
548     * <p>There are a couple of tricks here.  First, it is necessary to
549     * declare the XML default attributes after the DTD (if present)
550     * has been read.  Second, it is not possible to expand general
551     * references in attribute value literals until after the entire
552     * DTD (if present) has been parsed.
553     * <p>We do not look for the XML declaration here, because it is
554     * handled by pushURL().
555     * @see #pushURL
556     */
557    void parseProlog() throws java.lang.Exception {
558        parseMisc();
559
560        if (tryRead("<!DOCTYPE")) {
561            parseDoctypedecl();
562            parseMisc();
563        }
564    }
565
566    /**
567     * Parse the XML declaration.
568     * <pre>
569     * [25] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
570     * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
571     * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
572     *               | S 'standalone' Eq '"' ("yes" | "no") '"'
573     * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
574     * </pre>
575     * <p>([80] to [82] are also significant.)
576     * <p>(The <code>&lt;?xml</code> and whitespace have already been read.)
577     * <p>TODO: validate value of standalone.
578     * @see #parseTextDecl
579     * @see #checkEncoding
580     */
581    void parseXMLDecl(boolean ignoreEncoding) throws java.lang.Exception {
582        String version;
583        String encodingName = null;
584
585        // String standalone = null;
586        // Read the version.
587        require("version");
588        parseEq();
589        version = readLiteral(0);
590
591        if (!version.equals("1.0")) {
592            error("unsupported XML version", version, "1.0");
593        }
594
595        // Try reading an encoding declaration.
596        skipWhitespace();
597
598        if (tryRead("encoding")) {
599            parseEq();
600            encodingName = readLiteral(0);
601            checkEncoding(encodingName, ignoreEncoding);
602        }
603
604        // Try reading a standalone declaration
605        skipWhitespace();
606
607        if (tryRead("standalone")) {
608            parseEq();
609
610            // FIXME: Why is the literal read, but the value ignored?
611            /* standalone = */readLiteral(0);
612        }
613
614        skipWhitespace();
615        require("?>");
616    }
617
618    /**
619     * Parse the Encoding PI.
620     * <pre>
621     * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
622     * [79] EncodingPI ::= '&lt;?xml' S 'encoding' Eq QEncoding S? '?&gt;'
623     * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
624     * [81] Encoding ::= LatinName
625     * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
626     * </pre>
627     * <p>(The <code>&lt;?xml</code>' and whitespace have already been read.)
628     * @see #parseXMLDecl
629     * @see #checkEncoding
630     */
631    void parseTextDecl(boolean ignoreEncoding) throws java.lang.Exception {
632        String encodingName = null;
633
634        // Read an optional version.
635        if (tryRead("version")) {
636            String version;
637            parseEq();
638            version = readLiteral(0);
639
640            if (!version.equals("1.0")) {
641                error("unsupported XML version", version, "1.0");
642            }
643
644            requireWhitespace();
645        }
646
647        // Read the encoding.
648        require("encoding");
649        parseEq();
650        encodingName = readLiteral(0);
651        checkEncoding(encodingName, ignoreEncoding);
652
653        skipWhitespace();
654        require("?>");
655    }
656
657    /**
658     * Check that the encoding specified makes sense.
659     * <p>Compare what the author has specified in the XML declaration
660     * or encoding PI with what we have detected.
661     * <p>This is also important for distinguishing among the various
662     * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect
663     * those).
664     * @param encodingName The name of the encoding specified by the user.
665     * @see #parseXMLDecl
666     * @see #parseTextDecl
667     */
668    void checkEncoding(String encodingName, boolean ignoreEncoding)
669            throws java.lang.Exception {
670        // FindBugs suggests using toUpperCase(Locale)
671        encodingName = encodingName.toUpperCase(Locale.getDefault());
672
673        if (ignoreEncoding) {
674            return;
675        }
676
677        switch (encoding) {
678        // 8-bit encodings
679        case ENCODING_UTF_8:
680
681            if (encodingName.equals("ISO-8859-1")) {
682                encoding = ENCODING_ISO_8859_1;
683            } else if (!encodingName.equals("UTF-8")) {
684                error("unsupported 8-bit encoding", encodingName,
685                        "UTF-8 or ISO-8859-1");
686            }
687
688            break;
689
690        // 16-bit encodings
691        case ENCODING_UCS_2_12:
692        case ENCODING_UCS_2_21:
693
694            if (!encodingName.equals("ISO-10646-UCS-2")
695                    && !encodingName.equals("UTF-16")) {
696                error("unsupported 16-bit encoding", encodingName,
697                        "ISO-10646-UCS-2");
698            }
699
700            break;
701
702        // 32-bit encodings
703        case ENCODING_UCS_4_1234:
704        case ENCODING_UCS_4_4321:
705        case ENCODING_UCS_4_2143:
706        case ENCODING_UCS_4_3412:
707
708            if (!encodingName.equals("ISO-10646-UCS-4")) {
709                error("unsupported 32-bit encoding", encodingName,
710                        "ISO-10646-UCS-4");
711            }
712        }
713    }
714
715    /**
716     * Parse miscellaneous markup outside the document element and DOCTYPE
717     * declaration.
718     * <pre>
719     * [27] Misc ::= Comment | PI | S
720     * </pre>
721     */
722    void parseMisc() throws java.lang.Exception {
723        while (true) {
724            skipWhitespace();
725
726            if (tryRead("<?")) {
727                parsePI();
728            } else if (tryRead("<!--")) {
729                parseComment();
730            } else {
731                return;
732            }
733        }
734    }
735
736    /**
737     * Parse a document type declaration.
738     * <pre>
739     * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
740     *                      ('[' %markupdecl* ']' S?)? '&gt;'
741     * </pre>
742     * <p>(The <code>&lt;!DOCTYPE</code> has already been read.)
743     */
744    void parseDoctypedecl() throws java.lang.Exception {
745        String doctypeName;
746        String[] ids;
747
748        // Read the document type name.
749        requireWhitespace();
750        doctypeName = readNmtoken(true);
751
752        // Read the ExternalIDs.
753        skipWhitespace();
754        ids = readExternalIds(false);
755
756        // Look for a declaration subset.
757        skipWhitespace();
758
759        if (tryRead('[')) {
760            // loop until the subset ends
761            while (true) {
762                context = CONTEXT_DTD;
763                skipWhitespace();
764                context = CONTEXT_NONE;
765
766                if (tryRead(']')) {
767                    break; // end of subset
768                } else {
769                    context = CONTEXT_DTD;
770                    parseMarkupdecl();
771                    context = CONTEXT_NONE;
772                }
773            }
774        }
775
776        // Read the external subset, if any
777        if (ids[1] != null) {
778            pushURL("[external subset]", ids[0], ids[1], null, null, null);
779
780            // Loop until we end up back at '>'
781            while (true) {
782                context = CONTEXT_DTD;
783                skipWhitespace();
784                context = CONTEXT_NONE;
785
786                if (tryRead('>')) {
787                    break;
788                } else {
789                    context = CONTEXT_DTD;
790                    parseMarkupdecl();
791                    context = CONTEXT_NONE;
792                }
793            }
794        } else {
795            // No external subset.
796            skipWhitespace();
797            require('>');
798        }
799
800        if (handler != null) {
801            handler.doctypeDecl(doctypeName, ids[0], ids[1]);
802        }
803
804        // Expand general entities in
805        // default values of attributes.
806        // (Do this after the doctypeDecl
807        // event!).
808        // expandAttributeDefaultValues();
809    }
810
811    /**
812     * Parse a markup declaration in the internal or external DTD subset.
813     * <pre>
814     * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl |
815     *                       %NotationDecl | %PI | %S | %Comment |
816     *                       InternalPERef )
817     * [30] InternalPERef ::= PEReference
818     * [31] extSubset ::= (%markupdecl | %conditionalSect)*
819     * </pre>
820     */
821    void parseMarkupdecl() throws java.lang.Exception {
822        if (tryRead("<!ELEMENT")) {
823            parseElementdecl();
824        } else if (tryRead("<!ATTLIST")) {
825            parseAttlistDecl();
826        } else if (tryRead("<!ENTITY")) {
827            parseEntityDecl();
828        } else if (tryRead("<!NOTATION")) {
829            parseNotationDecl();
830        } else if (tryRead("<?")) {
831            parsePI();
832        } else if (tryRead("<!--")) {
833            parseComment();
834        } else if (tryRead("<![")) {
835            parseConditionalSect();
836        } else {
837            error("expected markup declaration", null, null);
838        }
839    }
840
841    /**
842     * Parse an element, with its tags.
843     * <pre>
844     * [33] STag ::= '&lt;' Name (S Attribute)* S? '&gt;' [WFC: unique Att spec]
845     * [38] element ::= EmptyElement | STag content ETag
846     * [39] EmptyElement ::= '&lt;' Name (S Attribute)* S? '/&gt;'
847     *                       [WFC: unique Att spec]
848     * </pre>
849     * <p>(The '&lt;' has already been read.)
850     * <p>NOTE: this method actually chains onto parseContent(), if necessary,
851     * and parseContent() will take care of calling parseETag().
852     */
853    void parseElement() throws java.lang.Exception {
854        String gi;
855        char c;
856        int oldElementContent = currentElementContent;
857        String oldElement = currentElement;
858
859        // This is the (global) counter for the
860        // array of specified attributes.
861        tagAttributePos = 0;
862
863        // Read the element type name.
864        gi = readNmtoken(true);
865
866        // Determine the current content type.
867        currentElement = gi;
868        currentElementContent = getElementContentType(gi);
869
870        if (currentElementContent == CONTENT_UNDECLARED) {
871            currentElementContent = CONTENT_ANY;
872        }
873
874        // Read the attributes, if any.
875        // After this loop, we should be just
876        // in front of the closing delimiter.
877        skipWhitespace();
878        c = readCh();
879
880        while (c != '/' && c != '>') {
881            unread(c);
882            parseAttribute(gi);
883            skipWhitespace();
884            c = readCh();
885        }
886
887        unread(c);
888
889        // Supply any defaulted attributes.
890        Enumeration atts = declaredAttributes(gi);
891
892        if (atts != null) {
893            String aname;
894            loop: while (atts.hasMoreElements()) {
895                aname = (String) atts.nextElement();
896
897                // See if it was specified.
898                for (int i = 0; i < tagAttributePos; i++) {
899                    if (tagAttributes[i].equals(aname)) {
900                        continue loop;
901                    }
902                }
903
904                // I guess not...
905                if (handler != null) {
906                    handler.attribute(aname,
907                            getAttributeExpandedValue(gi, aname), false);
908                }
909            }
910        }
911
912        // Figure out if this is a start tag
913        // or an empty element, and dispatch an
914        // event accordingly.
915        c = readCh();
916
917        switch (c) {
918        case '>':
919
920            if (handler != null) {
921                handler.startElement(gi);
922            }
923
924            parseContent();
925            break;
926
927        case '/':
928            require('>');
929
930            if (handler != null) {
931                handler.startElement(gi);
932                handler.endElement(gi);
933            }
934
935            break;
936        }
937
938        // Restore the previous state.
939        currentElement = oldElement;
940        currentElementContent = oldElementContent;
941    }
942
943    /**
944     * Parse an attribute assignment.
945     * <pre>
946     * [34] Attribute ::= Name Eq AttValue
947     * </pre>
948     * @param name The name of the attribute's element.
949     * @see XmlHandler#attribute
950     */
951    void parseAttribute(String name) throws java.lang.Exception {
952        String aname;
953        int type;
954        String value;
955
956        // Read the attribute name.
957        aname = readNmtoken(true).intern();
958
959        // Fix by Zoltan Kemenczy for:
960        // "attribute value normalization according to Section 3.3.3
961        // Attribute-Value Normalization of XML 1.0
962        // http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize). It
963        // says that escaped whitespace character references that are not
964        // #x20 (like the newline,#xa) should be preserved in the
965        // normalized value)"
966        //type = getAttributeDefaultValueType(name, aname);
967        type = getAttributeType(name, aname);
968
969        // Parse '='
970        parseEq();
971
972        // Read the value, normalizing whitespace
973        // if it is not CDATA.
974        if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
975            value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF);
976        } else {
977            value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF | LIT_NORMALIZE);
978        }
979
980        // Inform the handler about the
981        // attribute.
982        if (handler != null) {
983            handler.attribute(aname, value, true);
984        }
985
986        dataBufferPos = 0;
987
988        // Note that the attribute has been
989        // specified.
990        if (tagAttributePos == tagAttributes.length) {
991            String[] newAttrib = new String[tagAttributes.length * 2];
992            System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
993            tagAttributes = newAttrib;
994        }
995
996        tagAttributes[tagAttributePos++] = aname;
997    }
998
999    /**
1000     * Parse an equals sign surrounded by optional whitespace.
1001     * [35] Eq ::= S? '=' S?
1002     */
1003    void parseEq() throws java.lang.Exception {
1004        skipWhitespace();
1005        require('=');
1006        skipWhitespace();
1007    }
1008
1009    /**
1010     * Parse an end tag.
1011     * [36] ETag ::= '&lt;/' Name S? '&gt;'
1012     * *NOTE: parseContent() chains to here.
1013     */
1014    void parseETag() throws java.lang.Exception {
1015        String name;
1016        name = readNmtoken(true);
1017
1018        if (!name.equals(currentElement)) {
1019            error("mismatched end tag", name, currentElement);
1020        }
1021
1022        skipWhitespace();
1023        require('>');
1024
1025        if (handler != null) {
1026            handler.endElement(name);
1027        }
1028    }
1029
1030    /**
1031     * Parse the content of an element.
1032     * [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)*
1033     * [68] Reference ::= EntityRef | CharRef
1034     */
1035    void parseContent() throws java.lang.Exception {
1036        char c;
1037
1038        while (true) {
1039            switch (currentElementContent) {
1040            case CONTENT_ANY:
1041            case CONTENT_MIXED:
1042                parsePCData();
1043                break;
1044
1045            case CONTENT_ELEMENTS:
1046                parseWhitespace();
1047                break;
1048            }
1049
1050            // Handle delimiters
1051            c = readCh();
1052
1053            switch (c) {
1054            case '&': // Found "&"
1055                c = readCh();
1056
1057                if (c == '#') {
1058                    parseCharRef();
1059                } else {
1060                    unread(c);
1061                    parseEntityRef(true);
1062                }
1063
1064                break;
1065
1066            case '<': // Found "<"
1067                c = readCh();
1068
1069                switch (c) {
1070                case '!': // Found "<!"
1071                    c = readCh();
1072
1073                    switch (c) {
1074                    case '-': // Found "<!-"
1075                        require('-');
1076                        parseComment();
1077                        break;
1078
1079                    case '[': // Found "<!["
1080                        require("CDATA[");
1081                        parseCDSect();
1082                        break;
1083
1084                    default:
1085                        error("expected comment or CDATA section", c, null);
1086                        break;
1087                    }
1088
1089                    break;
1090
1091                case '?': // Found "<?"
1092                    dataBufferFlush();
1093                    parsePI();
1094                    break;
1095
1096                case '/': // Found "</"
1097                    dataBufferFlush();
1098                    parseETag();
1099                    return;
1100
1101                default: // Found "<" followed by something else
1102                    dataBufferFlush();
1103                    unread(c);
1104                    parseElement();
1105                    break;
1106                }
1107            }
1108        }
1109    }
1110
1111    /**
1112     * Parse an element type declaration.
1113     * [40] elementdecl ::= '&lt;!ELEMENT' S %Name S (%S S)? %contentspec S? '&gt;'
1114     *                      [VC: Unique Element Declaration]
1115     * *NOTE: the '&lt;!ELEMENT' has already been read.
1116     */
1117    void parseElementdecl() throws java.lang.Exception {
1118        String name;
1119
1120        requireWhitespace();
1121
1122        // Read the element type name.
1123        name = readNmtoken(true);
1124
1125        requireWhitespace();
1126
1127        // Read the content model.
1128        parseContentspec(name);
1129
1130        skipWhitespace();
1131        require('>');
1132    }
1133
1134    /**
1135     * Content specification.
1136     * [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1137     */
1138    void parseContentspec(String name) throws java.lang.Exception {
1139        if (tryRead("EMPTY")) {
1140            setElement(name, CONTENT_EMPTY, null, null);
1141            return;
1142        } else if (tryRead("ANY")) {
1143            setElement(name, CONTENT_ANY, null, null);
1144            return;
1145        } else {
1146            require('(');
1147            dataBufferAppend('(');
1148            skipWhitespace();
1149
1150            if (tryRead("#PCDATA")) {
1151                dataBufferAppend("#PCDATA");
1152                parseMixed();
1153                setElement(name, CONTENT_MIXED, dataBufferToString(), null);
1154            } else {
1155                parseElements();
1156                setElement(name, CONTENT_ELEMENTS, dataBufferToString(), null);
1157            }
1158        }
1159    }
1160
1161    /**
1162     * Parse an element-content model.
1163     * [42] elements ::= (choice | seq) ('?' | '*' | '+')?
1164     * [44] cps ::= S? %cp S?
1165     * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')'
1166     * [46] ctokplus ::= cps ('|' cps)+
1167     * [47] ctoks ::= cps ('|' cps)*
1168     * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')'
1169     * [49] stoks ::= cps (',' cps)*
1170     * *NOTE: the opening '(' and S have already been read.
1171     * *TODO: go over parameter entity boundaries more carefully.
1172     */
1173    void parseElements() throws java.lang.Exception {
1174        char c;
1175        char sep;
1176
1177        // Parse the first content particle
1178        skipWhitespace();
1179        parseCp();
1180
1181        // Check for end or for a separator.
1182        skipWhitespace();
1183        c = readCh();
1184
1185        switch (c) {
1186        case ')':
1187            dataBufferAppend(')');
1188            c = readCh();
1189
1190            switch (c) {
1191            case '*':
1192            case '+':
1193            case '?':
1194                dataBufferAppend(c);
1195                break;
1196
1197            default:
1198                unread(c);
1199            }
1200
1201            return;
1202
1203        case ',': // Register the separator.
1204        case '|':
1205            sep = c;
1206            dataBufferAppend(c);
1207            break;
1208
1209        default:
1210            error("bad separator in content model", c, null);
1211            return;
1212        }
1213
1214        // Parse the rest of the content model.
1215        while (true) {
1216            skipWhitespace();
1217            parseCp();
1218            skipWhitespace();
1219            c = readCh();
1220
1221            if (c == ')') {
1222                dataBufferAppend(')');
1223                break;
1224            } else if (c != sep) {
1225                error("bad separator in content model", c, "'" + sep + "'");
1226                return;
1227            } else {
1228                dataBufferAppend(c);
1229            }
1230        }
1231
1232        // Check for the occurrence indicator.
1233        c = readCh();
1234
1235        switch (c) {
1236        case '?':
1237        case '*':
1238        case '+':
1239            dataBufferAppend(c);
1240            return;
1241
1242        default:
1243            unread(c);
1244            return;
1245        }
1246    }
1247
1248    /**
1249     * Parse a content particle.
1250     * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+')
1251     * *NOTE: I actually use a slightly different production here:
1252     *        cp ::= (elements | (Name ('?' | '*' | '+')?))
1253     */
1254    void parseCp() throws java.lang.Exception {
1255        char c;
1256
1257        if (tryRead('(')) {
1258            dataBufferAppend('(');
1259            parseElements();
1260        } else {
1261            dataBufferAppend(readNmtoken(true));
1262            c = readCh();
1263
1264            switch (c) {
1265            case '?':
1266            case '*':
1267            case '+':
1268                dataBufferAppend(c);
1269                break;
1270
1271            default:
1272                unread(c);
1273                break;
1274            }
1275        }
1276    }
1277
1278    /**
1279     * Parse mixed content.
1280     * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*'
1281     *              | '(' S? %('#PCDATA') S? ')'
1282     * [51] Mtoks ::= %Name (S? '|' S? %Name)*
1283     * *NOTE: the S and '#PCDATA' have already been read.
1284     */
1285    void parseMixed() throws java.lang.Exception {
1286        // Check for PCDATA alone.
1287        skipWhitespace();
1288
1289        if (tryRead(')')) {
1290            dataBufferAppend(")*");
1291            tryRead('*');
1292            return;
1293        }
1294
1295        // Parse mixed content.
1296        skipWhitespace();
1297
1298        while (!tryRead(")*")) {
1299            require('|');
1300            dataBufferAppend('|');
1301            skipWhitespace();
1302            dataBufferAppend(readNmtoken(true));
1303            skipWhitespace();
1304        }
1305
1306        dataBufferAppend(")*");
1307    }
1308
1309    /**
1310     * Parse an attribute list declaration.
1311     * [52] AttlistDecl ::= '&lt;!ATTLIST' S %Name S? %AttDef+ S? '&gt;'
1312     * *NOTE: the '&lt;!ATTLIST' has already been read.
1313     */
1314    void parseAttlistDecl() throws java.lang.Exception {
1315        String elementName;
1316
1317        requireWhitespace();
1318        elementName = readNmtoken(true);
1319        requireWhitespace();
1320
1321        while (!tryRead('>')) {
1322            parseAttDef(elementName);
1323            skipWhitespace();
1324        }
1325    }
1326
1327    /**
1328     * Parse a single attribute definition.
1329     * [53] AttDef ::= S %Name S %AttType S %Default
1330     */
1331    void parseAttDef(String elementName) throws java.lang.Exception {
1332        String name;
1333        int type;
1334        String enumeration = null;
1335
1336        // Read the attribute name.
1337        name = readNmtoken(true);
1338
1339        // Read the attribute type.
1340        requireWhitespace();
1341        type = readAttType();
1342
1343        // Get the string of enumerated values
1344        // if necessary.
1345        if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
1346            enumeration = dataBufferToString();
1347        }
1348
1349        // Read the default value.
1350        requireWhitespace();
1351        parseDefault(elementName, name, type, enumeration);
1352    }
1353
1354    /**
1355     * Parse the attribute type.
1356     * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1357     * [55] StringType ::= 'CDATA'
1358     * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' |
1359     *                        'NMTOKEN' | 'NMTOKENS'
1360     * [57] EnumeratedType ::= NotationType | Enumeration
1361     * *TODO: validate the type!!
1362     */
1363    int readAttType() throws java.lang.Exception {
1364        String typeString;
1365        Integer type;
1366
1367        if (tryRead('(')) {
1368            parseEnumeration();
1369            return ATTRIBUTE_ENUMERATED;
1370        } else {
1371            typeString = readNmtoken(true);
1372
1373            if (typeString.equals("NOTATION")) {
1374                parseNotationType();
1375            }
1376
1377            type = (Integer) attributeTypeHash.get(typeString);
1378
1379            if (type == null) {
1380                error("illegal attribute type", typeString, null);
1381                return ATTRIBUTE_UNDECLARED;
1382            } else {
1383                return type.intValue();
1384            }
1385        }
1386    }
1387
1388    /**
1389     * Parse an enumeration.
1390     * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')'
1391     * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)*
1392     * *NOTE: the '(' has already been read.
1393     */
1394    void parseEnumeration() throws java.lang.Exception {
1395        dataBufferAppend('(');
1396
1397        // Read the first token.
1398        skipWhitespace();
1399        dataBufferAppend(readNmtoken(true));
1400
1401        // Read the remaining tokens.
1402        skipWhitespace();
1403
1404        while (!tryRead(')')) {
1405            require('|');
1406            dataBufferAppend('|');
1407            skipWhitespace();
1408            dataBufferAppend(readNmtoken(true));
1409            skipWhitespace();
1410        }
1411
1412        dataBufferAppend(')');
1413    }
1414
1415    /**
1416     * Parse a notation type for an attribute.
1417     * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)*
1418     *                       S? ')'
1419     * [59] Ntoks ::= %Name (S? '|' S? %Name)
1420     * *NOTE: the 'NOTATION' has already been read
1421     */
1422    void parseNotationType() throws java.lang.Exception {
1423        requireWhitespace();
1424        require('(');
1425
1426        parseEnumeration();
1427    }
1428
1429    /**
1430     * Parse the default value for an attribute.
1431     * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? %AttValue
1432     */
1433    void parseDefault(String elementName, String name, int type,
1434            String enumeration) throws java.lang.Exception {
1435        int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1436        String value = null;
1437
1438        if (tryRead('#')) {
1439            if (tryRead("FIXED")) {
1440                valueType = ATTRIBUTE_DEFAULT_FIXED;
1441                requireWhitespace();
1442                context = CONTEXT_ATTRIBUTEVALUE;
1443                value = readLiteral(LIT_CHAR_REF);
1444                context = CONTEXT_DTD;
1445            } else if (tryRead("REQUIRED")) {
1446                valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1447            } else if (tryRead("IMPLIED")) {
1448                valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1449            } else {
1450                error("illegal keyword for attribute default value", null,
1451                        null);
1452            }
1453        } else {
1454            context = CONTEXT_ATTRIBUTEVALUE;
1455            value = readLiteral(LIT_CHAR_REF);
1456            context = CONTEXT_DTD;
1457        }
1458
1459        setAttribute(elementName, name, type, enumeration, value, valueType);
1460    }
1461
1462    /**
1463     * Parse a conditional section.
1464     * [63] conditionalSect ::= includeSect || ignoreSect
1465     * [64] includeSect ::= '&lt;![' %'INCLUDE' '[' (%markupdecl*)* ']]&gt;'
1466     * [65] ignoreSect ::= '&lt;![' %'IGNORE' '[' ignoreSectContents* ']]&gt;'
1467     * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]&gt;'))
1468     *                           | ('&lt;![' ignoreSectContents* ']]&gt;')
1469     *                           | (Char - (']' | [&lt;'"]))
1470     *                           | ('&lt;!' (Char - ('-' | '[')))
1471     * *NOTE: the '&lt;![' has already been read.
1472     * *TODO: verify that I am handling ignoreSectContents right.
1473     */
1474    void parseConditionalSect() throws java.lang.Exception {
1475        skipWhitespace();
1476
1477        if (tryRead("INCLUDE")) {
1478            skipWhitespace();
1479            require('[');
1480            skipWhitespace();
1481
1482            while (!tryRead("]]>")) {
1483                parseMarkupdecl();
1484                skipWhitespace();
1485            }
1486        } else if (tryRead("IGNORE")) {
1487            skipWhitespace();
1488            require('[');
1489
1490            char c;
1491
1492            for (int nest = 1; nest > 0;) {
1493                c = readCh();
1494
1495                switch (c) {
1496                case '<':
1497
1498                    if (tryRead("![")) {
1499                        nest++;
1500                    }
1501
1502                    break;
1503
1504                case ']':
1505
1506                    if (tryRead("]>")) {
1507                        nest--;
1508                    }
1509
1510                    break;
1511                }
1512            }
1513        } else {
1514            error("conditional section must begin with INCLUDE or IGNORE", null,
1515                    null);
1516        }
1517    }
1518
1519    /**
1520     * Read a character reference.
1521     * [67] CharRef ::= '&amp;#' [0-9]+ ';' | '&amp;#x' [0-9a-fA-F]+ ';'
1522     * *NOTE: the '&amp;#' has already been read.
1523     */
1524    void parseCharRef() throws java.lang.Exception {
1525        int value = 0;
1526        char c;
1527
1528        if (tryRead('x')) {
1529            loop1: while (true) {
1530                c = readCh();
1531
1532                switch (c) {
1533                case '0':
1534                case '1':
1535                case '2':
1536                case '3':
1537                case '4':
1538                case '5':
1539                case '6':
1540                case '7':
1541                case '8':
1542                case '9':
1543                case 'a':
1544                case 'A':
1545                case 'b':
1546                case 'B':
1547                case 'c':
1548                case 'C':
1549                case 'd':
1550                case 'D':
1551                case 'e':
1552                case 'E':
1553                case 'f':
1554                case 'F':
1555                    value *= 16;
1556                    value += Integer.parseInt(Character.toString(c), 16);
1557                    break;
1558
1559                case ';':
1560                    break loop1;
1561
1562                default:
1563                    error("illegal character in character reference", c, null);
1564                    break loop1;
1565                }
1566            }
1567        } else {
1568            loop2: while (true) {
1569                c = readCh();
1570
1571                switch (c) {
1572                case '0':
1573                case '1':
1574                case '2':
1575                case '3':
1576                case '4':
1577                case '5':
1578                case '6':
1579                case '7':
1580                case '8':
1581                case '9':
1582                    value *= 10;
1583                    value += Integer.parseInt(Character.toString(c), 10);
1584                    break;
1585
1586                case ';':
1587                    break loop2;
1588
1589                default:
1590                    error("illegal character in character reference", c, null);
1591                    break loop2;
1592                }
1593            }
1594        }
1595
1596        // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
1597        //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
1598        if (value <= 0x0000ffff) {
1599            // no surrogates needed
1600            dataBufferAppend((char) value);
1601        } else if (value <= 0x000fffff) {
1602            // > 16 bits, surrogate needed
1603            dataBufferAppend((char) (0xd8 | (value & 0x000ffc00) >> 10));
1604            dataBufferAppend((char) (0xdc | value & 0x0003ff));
1605        } else {
1606            // too big for surrogate
1607            error("character reference " + value + " is too large for UTF-16",
1608                    Integer.valueOf(value).toString(), null);
1609        }
1610    }
1611
1612    /**
1613     * Parse a reference.
1614     * [69] EntityRef ::= '&amp;' Name ';'
1615     * *NOTE: the '&amp;' has already been read.
1616     * @param externalAllowed External entities are allowed here.
1617     */
1618    void parseEntityRef(boolean externalAllowed) throws java.lang.Exception {
1619        String name;
1620
1621        name = readNmtoken(true);
1622        require(';');
1623
1624        switch (getEntityType(name)) {
1625        case ENTITY_UNDECLARED:
1626            error("reference to undeclared entity", name, null);
1627            break;
1628
1629        case ENTITY_INTERNAL:
1630            pushString(name, getEntityValue(name));
1631            break;
1632
1633        case ENTITY_TEXT:
1634
1635            if (externalAllowed) {
1636                pushURL(name, getEntityPublicId(name), getEntitySystemId(name),
1637                        null, null, null);
1638            } else {
1639                error("reference to external entity in attribute value.", name,
1640                        null);
1641            }
1642
1643            break;
1644
1645        case ENTITY_NDATA:
1646
1647            if (externalAllowed) {
1648                error("data entity reference in content", name, null);
1649            } else {
1650                error("reference to external entity in attribute value.", name,
1651                        null);
1652            }
1653
1654            break;
1655        }
1656    }
1657
1658    /**
1659     * Parse a parameter entity reference.
1660     * [70] PEReference ::= '%' Name ';'
1661     * *NOTE: the '%' has already been read.
1662     */
1663    void parsePEReference(boolean isEntityValue) throws java.lang.Exception {
1664        String name;
1665
1666        name = "%" + readNmtoken(true);
1667        require(';');
1668
1669        switch (getEntityType(name)) {
1670        case ENTITY_UNDECLARED:
1671            error("reference to undeclared parameter entity", name, null);
1672            break;
1673
1674        case ENTITY_INTERNAL:
1675
1676            if (isEntityValue) {
1677                pushString(name, getEntityValue(name));
1678            } else {
1679                pushString(name, " " + getEntityValue(name) + ' ');
1680            }
1681
1682            break;
1683
1684        case ENTITY_TEXT:
1685
1686            if (isEntityValue) {
1687                pushString(null, " ");
1688            }
1689
1690            pushURL(name, getEntityPublicId(name), getEntitySystemId(name),
1691                    null, null, null);
1692
1693            if (isEntityValue) {
1694                pushString(null, " ");
1695            }
1696
1697            break;
1698        }
1699    }
1700
1701    /**
1702     * Parse an entity declaration.
1703     * [71] EntityDecl ::= '&lt;!ENTITY' S %Name S %EntityDef S? '&gt;'
1704     *                   | '&lt;!ENTITY' S '%' S %Name S %EntityDef S? '&gt;'
1705     * [72] EntityDef ::= EntityValue | ExternalDef
1706     * [73] ExternalDef ::= ExternalID %NDataDecl?
1707     * [74] ExternalID ::= 'SYSTEM' S SystemLiteral
1708     *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
1709     * [75] NDataDecl ::= S %'NDATA' S %Name
1710     * *NOTE: the '&lt;!ENTITY' has already been read.
1711     */
1712    void parseEntityDecl() throws java.lang.Exception {
1713        char c;
1714        boolean peFlag = false;
1715        String name;
1716        String value;
1717        String notationName;
1718        String[] ids;
1719
1720        // Check for a parameter entity.
1721        requireWhitespace();
1722
1723        if (tryRead('%')) {
1724            peFlag = true;
1725            requireWhitespace();
1726        }
1727
1728        // Read the entity name, and prepend
1729        // '%' if necessary.
1730        name = readNmtoken(true);
1731
1732        if (peFlag) {
1733            name = "%" + name;
1734        }
1735
1736        // Read the entity value.
1737        requireWhitespace();
1738        c = readCh();
1739        unread(c);
1740
1741        if (c == '"' || c == '\'') {
1742            // Internal entity.
1743            context = CONTEXT_ENTITYVALUE;
1744            value = readLiteral(LIT_CHAR_REF | LIT_PE_REF);
1745            context = CONTEXT_DTD;
1746            setInternalEntity(name, value);
1747        } else {
1748            // Read the external IDs
1749            ids = readExternalIds(false);
1750
1751            if (ids[1] == null) {
1752                error("system identifier missing", name, null);
1753            }
1754
1755            // Check for NDATA declaration.
1756            skipWhitespace();
1757
1758            if (tryRead("NDATA")) {
1759                requireWhitespace();
1760                notationName = readNmtoken(true);
1761                setExternalDataEntity(name, ids[0], ids[1], notationName);
1762            } else {
1763                setExternalTextEntity(name, ids[0], ids[1]);
1764            }
1765        }
1766
1767        // Finish the declaration.
1768        skipWhitespace();
1769        require('>');
1770    }
1771
1772    /**
1773     * Parse a notation declaration.
1774     * [81] NotationDecl ::= '&lt;!NOTATION' S %Name S %ExternalID S? '&gt;'
1775     * *NOTE: the '&lt;!NOTATION' has already been read.
1776     */
1777    void parseNotationDecl() throws java.lang.Exception {
1778        String nname;
1779        String[] ids;
1780
1781        requireWhitespace();
1782        nname = readNmtoken(true);
1783
1784        requireWhitespace();
1785
1786        // Read the external identifiers.
1787        ids = readExternalIds(true);
1788
1789        if (ids[0] == null && ids[1] == null) {
1790            error("external identifier missing", nname, null);
1791        }
1792
1793        // Register the notation.
1794        setNotation(nname, ids[0], ids[1]);
1795
1796        skipWhitespace();
1797        require('>');
1798    }
1799
1800    /**
1801     * Parse PCDATA.
1802     * <pre>
1803     * [16] PCData ::= [^&lt;&amp;]*
1804     * </pre>
1805     * <p>The trick here is that the data stays in the dataBuffer without
1806     * necessarily being converted to a string right away.
1807     */
1808    void parsePCData() throws java.lang.Exception {
1809        char c;
1810
1811        // Start with a little cheat -- in most
1812        // cases, the entire sequence of
1813        // character data will already be in
1814        // the readBuffer; if not, fall through to
1815        // the normal approach.
1816        if (USE_CHEATS) {
1817            int lineAugment = 0;
1818            int columnAugment = 0;
1819
1820            /*loop:*/for (int i = readBufferPos; i < readBufferLength; i++) {
1821                switch (readBuffer[i]) {
1822                case '\n':
1823                    lineAugment++;
1824                    columnAugment = 0;
1825                    break;
1826
1827                case '&':
1828                case '<':
1829
1830                    int start = readBufferPos;
1831                    columnAugment++;
1832                    readBufferPos = i;
1833
1834                    if (lineAugment > 0) {
1835                        line += lineAugment;
1836                        column = columnAugment;
1837                    } else {
1838                        column += columnAugment;
1839                    }
1840
1841                    dataBufferAppend(readBuffer, start, i - start);
1842                    return;
1843
1844                default:
1845                    columnAugment++;
1846                }
1847            }
1848        }
1849
1850        // OK, the cheat didn't work; start over
1851        // and do it by the book.
1852        while (true) {
1853            c = readCh();
1854
1855            switch (c) {
1856            case '<':
1857            case '&':
1858                unread(c);
1859                return;
1860
1861            default:
1862                dataBufferAppend(c);
1863                break;
1864            }
1865        }
1866    }
1867
1868    //////////////////////////////////////////////////////////////////////
1869    // High-level reading and scanning methods.
1870    //////////////////////////////////////////////////////////////////////
1871
1872    /**
1873     * Require whitespace characters.
1874     * [1] S ::= (#x20 | #x9 | #xd | #xa)+
1875     */
1876    void requireWhitespace() throws java.lang.Exception {
1877        char c = readCh();
1878
1879        if (isWhitespace(c)) {
1880            skipWhitespace();
1881        } else {
1882            error("whitespace expected", c, null);
1883        }
1884    }
1885
1886    /**
1887     * Parse whitespace characters, and leave them in the data buffer.
1888     */
1889    void parseWhitespace() throws java.lang.Exception {
1890        char c = readCh();
1891
1892        while (isWhitespace(c)) {
1893            dataBufferAppend(c);
1894            c = readCh();
1895        }
1896
1897        unread(c);
1898    }
1899
1900    /**
1901     * Skip whitespace characters.
1902     * [1] S ::= (#x20 | #x9 | #xd | #xa)+
1903     */
1904    void skipWhitespace() throws java.lang.Exception {
1905        // Start with a little cheat.  Most of
1906        // the time, the white space will fall
1907        // within the current read buffer; if
1908        // not, then fall through.
1909        if (USE_CHEATS) {
1910            int lineAugment = 0;
1911            int columnAugment = 0;
1912
1913            loop: for (int i = readBufferPos; i < readBufferLength; i++) {
1914                switch (readBuffer[i]) {
1915                case ' ':
1916                case '\t':
1917                case '\r':
1918                    columnAugment++;
1919                    break;
1920
1921                case '\n':
1922                    lineAugment++;
1923                    columnAugment = 0;
1924                    break;
1925
1926                case '%':
1927
1928                    if (context == CONTEXT_DTD
1929                            || context == CONTEXT_ENTITYVALUE) {
1930                        break loop;
1931                    } // else fall through...
1932
1933                default:
1934                    readBufferPos = i;
1935
1936                    if (lineAugment > 0) {
1937                        line += lineAugment;
1938                        column = columnAugment;
1939                    } else {
1940                        column += columnAugment;
1941                    }
1942
1943                    return;
1944                }
1945            }
1946        }
1947
1948        // OK, do it by the book.
1949        char c = readCh();
1950
1951        while (isWhitespace(c)) {
1952            c = readCh();
1953        }
1954
1955        unread(c);
1956    }
1957
1958    /**
1959     * Read a name or name token.
1960     * [5] Name ::= (Letter | '_' | ':') (NameChar)*
1961     * [7] Nmtoken ::= (NameChar)+
1962     * *NOTE: [6] is implemented implicitly where required.
1963     */
1964    String readNmtoken(boolean isName) throws java.lang.Exception {
1965        char c;
1966
1967        if (USE_CHEATS) {
1968            loop: for (int i = readBufferPos; i < readBufferLength; i++) {
1969                switch (readBuffer[i]) {
1970                case '%':
1971
1972                    if (context == CONTEXT_DTD
1973                            || context == CONTEXT_ENTITYVALUE) {
1974                        break loop;
1975                    } // else fall through...
1976
1977                case '<':
1978                case '>':
1979                case '&':
1980                case ',':
1981                case '|':
1982                case '*':
1983                case '+':
1984                case '?':
1985                case ')':
1986                case '=':
1987                case '\'':
1988                case '"':
1989                case '[':
1990                case ' ':
1991                case '\t':
1992                case '\r':
1993                case '\n':
1994                case ';':
1995                case '/':
1996                case '#':
1997
1998                    int start = readBufferPos;
1999
2000                    if (i == start) {
2001                        error("name expected", readBuffer[i], null);
2002                    }
2003
2004                    readBufferPos = i;
2005                    return intern(readBuffer, start, i - start);
2006                }
2007            }
2008        }
2009
2010        nameBufferPos = 0;
2011
2012        // Read the first character.
2013        /*loop: */while (true) {
2014            c = readCh();
2015
2016            switch (c) {
2017            case '%':
2018            case '<':
2019            case '>':
2020            case '&':
2021            case ',':
2022            case '|':
2023            case '*':
2024            case '+':
2025            case '?':
2026            case ')':
2027            case '=':
2028            case '\'':
2029            case '"':
2030            case '[':
2031            case ' ':
2032            case '\t':
2033            case '\n':
2034            case '\r':
2035            case ';':
2036            case '/':
2037                unread(c);
2038
2039                if (nameBufferPos == 0) {
2040                    error("name expected", null, null);
2041                }
2042
2043                String s = intern(nameBuffer, 0, nameBufferPos);
2044                nameBufferPos = 0;
2045                return s;
2046
2047            default:
2048                nameBuffer = (char[]) extendArray(nameBuffer, nameBuffer.length,
2049                        nameBufferPos);
2050                nameBuffer[nameBufferPos++] = c;
2051            }
2052        }
2053    }
2054
2055    /**
2056     * Read a literal.
2057     * [10] AttValue ::= '"' ([^&lt;&amp;"] | Reference)* '"'
2058     *                 | "'" ([^&lt;&amp;'] | Reference)* "'"
2059     * [11] SystemLiteral ::= '"' URLchar* '"' | "'" (URLchar - "'")* "'"
2060     * [13] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2061     * [9] EntityValue ::= '"' ([^%&amp;"] | PEReference | Reference)* '"'
2062     *                   | "'" ([^%&amp;'] | PEReference | Reference)* "'"
2063     */
2064    String readLiteral(int flags) throws java.lang.Exception {
2065        char delim;
2066        char c;
2067        int startLine = line;
2068
2069        // Find the delimiter.
2070        delim = readCh();
2071
2072        if (delim != '"' && delim != '\'' && delim != (char) 0) {
2073            error("expected '\"' or \"'\"", delim, null);
2074            return null;
2075        }
2076
2077        // Read the literal.
2078        try {
2079            c = readCh();
2080
2081            loop: while (c != delim) {
2082                switch (c) {
2083                // Literals never have line ends
2084                case '\n':
2085                case '\r':
2086                    c = ' ';
2087                    break;
2088
2089                // References may be allowed
2090                case '&':
2091
2092                    if ((flags & LIT_CHAR_REF) > 0) {
2093                        c = readCh();
2094
2095                        if (c == '#') {
2096                            parseCharRef();
2097                            c = readCh();
2098                            continue loop; // check the next character
2099                        } else if ((flags & LIT_ENTITY_REF) > 0) {
2100                            unread(c);
2101                            parseEntityRef(false);
2102                            c = readCh();
2103                            continue loop;
2104                        } else {
2105                            dataBufferAppend('&');
2106                        }
2107                    }
2108
2109                    break;
2110
2111                default:
2112                    break;
2113                }
2114
2115                dataBufferAppend(c);
2116                c = readCh();
2117            }
2118        } catch (EOFException e) {
2119            error("end of input while looking for delimiter (started on line "
2120                    + startLine + ')', null, Character.toString(delim));
2121        }
2122
2123        // Normalise whitespace if necessary.
2124        if ((flags & LIT_NORMALIZE) > 0) {
2125            dataBufferNormalize();
2126        }
2127
2128        // Return the value.
2129        return dataBufferToString();
2130    }
2131
2132    /**
2133     * Try reading external identifiers.
2134     * <p>The system identifier is not required for notations.
2135     * @param inNotation Are we in a notation?
2136     * @return A two-member String array containing the identifiers.
2137     */
2138    String[] readExternalIds(boolean inNotation) throws java.lang.Exception {
2139        String[] ids = new String[2];
2140
2141        if (tryRead("PUBLIC")) {
2142            requireWhitespace();
2143            ids[0] = readLiteral(LIT_NORMALIZE); // public id
2144
2145            if (inNotation) {
2146                skipWhitespace();
2147
2148                if (tryRead('"') || tryRead('\'')) {
2149                    ids[1] = readLiteral(0);
2150                }
2151            } else {
2152                requireWhitespace();
2153                ids[1] = readLiteral(0); // system id
2154            }
2155        } else if (tryRead("SYSTEM")) {
2156            requireWhitespace();
2157            ids[1] = readLiteral(0); // system id
2158        }
2159
2160        return ids;
2161    }
2162
2163    /**
2164     * Test if a character is whitespace.
2165     * <pre>
2166     * [1] S ::= (#x20 | #x9 | #xd | #xa)+
2167     * </pre>
2168     * @param c The character to test.
2169     * @return true if the character is whitespace.
2170     */
2171    final boolean isWhitespace(char c) {
2172        switch (c) {
2173        case 0x20:
2174        case 0x09:
2175        case 0x0d:
2176        case 0x0a:
2177            return true;
2178
2179        default:
2180            return false;
2181        }
2182    }
2183
2184    //////////////////////////////////////////////////////////////////////
2185    // Utility routines.
2186    //////////////////////////////////////////////////////////////////////
2187
2188    /**
2189     * Add a character to the data buffer.
2190     */
2191    void dataBufferAppend(char c) {
2192        // Expand buffer if necessary.
2193        if (dataBufferPos >= dataBuffer.length) {
2194
2195            // dataBufferAppend() gets called alot, so instead of
2196            // calling extendArray() here, we optimize the heck out of this
2197            // code.
2198            //dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
2199            //        dataBufferPos);
2200
2201            final int currentSize = dataBuffer.length;
2202            int newSize = currentSize * 2;
2203
2204            if (newSize <= dataBufferPos) {
2205                newSize = dataBufferPos + 1;
2206            }
2207
2208            // Dwight Richards pointed out that newSize was ignored (11/03)
2209            char[] newArray = new char[newSize];
2210
2211            System.arraycopy(dataBuffer, 0, newArray, 0, currentSize);
2212            dataBuffer = newArray;
2213        }
2214        dataBuffer[dataBufferPos++] = c;
2215    }
2216
2217    /**
2218     * Add a string to the data buffer.
2219     */
2220    void dataBufferAppend(String s) {
2221        dataBufferAppend(s.toCharArray(), 0, s.length());
2222    }
2223
2224    /**
2225     * Append (part of) a character array to the data buffer.
2226     */
2227    void dataBufferAppend(char[] ch, int start, int length) {
2228        dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
2229                dataBufferPos + length);
2230        System.arraycopy(ch, start, dataBuffer, dataBufferPos, length);
2231        dataBufferPos += length;
2232    }
2233
2234    /**
2235     * Normalise whitespace in the data buffer.
2236     */
2237    void dataBufferNormalize() {
2238        int i = 0;
2239        int j = 0;
2240        int end = dataBufferPos;
2241
2242        // Skip whitespace at the start.
2243        while (j < end && isWhitespace(dataBuffer[j])) {
2244            j++;
2245        }
2246
2247        // Skip whitespace at the end.
2248        while (end > j && isWhitespace(dataBuffer[end - 1])) {
2249            end--;
2250        }
2251
2252        // Start copying to the left.
2253        while (j < end) {
2254            char c = dataBuffer[j++];
2255
2256            // Normalise all other whitespace to
2257            // a single space.
2258            if (isWhitespace(c)) {
2259                while (j < end && isWhitespace(dataBuffer[j++])) {
2260                }
2261
2262                dataBuffer[i++] = ' ';
2263                dataBuffer[i++] = dataBuffer[j - 1];
2264            } else {
2265                dataBuffer[i++] = c;
2266            }
2267        }
2268
2269        // The new length is <= the old one.
2270        dataBufferPos = i;
2271    }
2272
2273    /**
2274     * Convert the data buffer to a string.
2275     * @see #intern(char[],int,int)
2276     */
2277    String dataBufferToString() {
2278        String s = new String(dataBuffer, 0, dataBufferPos);
2279        dataBufferPos = 0;
2280        return s;
2281    }
2282
2283    /**
2284     * Flush the contents of the data buffer to the handler, if
2285     * appropriate, and reset the buffer for new input.
2286     */
2287    void dataBufferFlush() throws java.lang.Exception {
2288        if (dataBufferPos > 0) {
2289            switch (currentElementContent) {
2290            case CONTENT_UNDECLARED:
2291            case CONTENT_EMPTY:
2292
2293                // do nothing
2294                break;
2295
2296            case CONTENT_MIXED:
2297            case CONTENT_ANY:
2298
2299                if (handler != null) {
2300                    handler.charData(dataBuffer, 0, dataBufferPos);
2301                }
2302
2303                break;
2304
2305            case CONTENT_ELEMENTS:
2306
2307                if (handler != null) {
2308                    handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos);
2309                }
2310
2311                break;
2312            }
2313
2314            dataBufferPos = 0;
2315        }
2316    }
2317
2318    /**
2319     * Require a string to appear, or throw an exception.
2320     */
2321    void require(String delim) throws java.lang.Exception {
2322        char[] ch = delim.toCharArray();
2323
2324        for (char element : ch) {
2325            require(element);
2326        }
2327    }
2328
2329    /**
2330     * Require a character to appear, or throw an exception.
2331     */
2332    void require(char delim) throws java.lang.Exception {
2333        char c = readCh();
2334
2335        if (c != delim) {
2336            error("expected character", c, Character.toString(delim));
2337        }
2338    }
2339
2340    /**
2341     * Return an internalised version of a string.
2342     * <p>&AElig;lfred uses this method to create an internalised version
2343     * of all names and attribute values, so that it can test equality
2344     * with <code>==</code> instead of <code>String.equals()</code>.
2345     * <p>If you want to be able to test for equality in the same way,
2346     * you can use this method to internalise your own strings first:
2347     * <pre>
2348     * String PARA = handler.intern("PARA");
2349     * </pre>
2350     * <p>Note that this will not return the same results as String.intern().
2351     * @param s The string to internalise.
2352     * @return An internalised version of the string.
2353     * @see #intern(char[],int,int)
2354     * @see java.lang.String#intern
2355     */
2356    public String intern(String s) {
2357        char[] ch = s.toCharArray();
2358        return intern(ch, 0, ch.length);
2359    }
2360
2361    /**
2362     * Create an internalised string from a character array.
2363     * <p>This is much more efficient than constructing a non-internalised
2364     * string first, and then internalising it.
2365     * <p>Note that this will not return the same results as String.intern().
2366     * @param ch an array of characters for building the string.
2367     * @param start the starting position in the array.
2368     * @param length the number of characters to place in the string.
2369     * @return an internalised string.
2370     * @see #intern(String)
2371     * @see java.lang.String#intern
2372     */
2373    public String intern(char[] ch, int start, int length) {
2374        int index;
2375        int hash = 0;
2376
2377        // Generate a hash code.
2378        for (int i = start; i < start + length; i++) {
2379            hash = (hash << 1 & 0xffffff) + ch[i];
2380        }
2381
2382        hash = hash % SYMBOL_TABLE_LENGTH;
2383
2384        // Get the bucket.
2385        Object[] bucket = (Object[]) symbolTable[hash];
2386
2387        if (bucket == null) {
2388            symbolTable[hash] = bucket = new Object[8];
2389        }
2390
2391        // Search for a matching tuple, and
2392        // return the string if we find one.
2393        for (index = 0; index < bucket.length; index += 2) {
2394            char[] chFound = (char[]) bucket[index];
2395
2396            // Stop when we hit a null index.
2397            if (chFound == null) {
2398                break;
2399            }
2400
2401            // If they're the same length,
2402            // check for a match.
2403            // If the loop finishes, 'index' will
2404            // contain the current bucket
2405            // position.
2406            if (chFound.length == length) {
2407                for (int i = 0; i < chFound.length; i++) {
2408                    // Stop if there are no more tuples.
2409                    if (ch[start + i] != chFound[i]) {
2410                        break;
2411                    } else if (i == length - 1) {
2412                        // That's it, we have a match!
2413                        return (String) bucket[index + 1];
2414                    }
2415                }
2416            }
2417        }
2418
2419        // Not found -- we'll have to add it.
2420        // Do we have to grow the bucket?
2421        bucket = (Object[]) extendArray(bucket, bucket.length, index);
2422
2423        // OK, add it to the end of the
2424        // bucket.
2425        String s = new String(ch, start, length);
2426        bucket[index] = s.toCharArray();
2427        bucket[index + 1] = s;
2428        symbolTable[hash] = bucket;
2429        return s;
2430    }
2431
2432    /**
2433     * Ensure the capacity of an array, allocating a new one if
2434     * necessary.
2435     */
2436    Object extendArray(Object array, int currentSize, int requiredSize) {
2437        if (requiredSize < currentSize) {
2438            return array;
2439        } else {
2440            Object newArray = null;
2441            int newSize = currentSize * 2;
2442
2443            if (newSize <= requiredSize) {
2444                newSize = requiredSize + 1;
2445            }
2446
2447            // Dwight Richards pointed out that newSize was ignored (11/03)
2448            if (array instanceof char[]) {
2449                newArray = new char[newSize];
2450            } else if (array instanceof Object[]) {
2451                newArray = new Object[newSize];
2452            } else {
2453                throw new RuntimeException("Array must be char[] or Object[]");
2454            }
2455
2456            System.arraycopy(array, 0, newArray, 0, currentSize);
2457            return newArray;
2458        }
2459    }
2460
2461    //////////////////////////////////////////////////////////////////////
2462    // XML query routines.
2463    //////////////////////////////////////////////////////////////////////
2464    //
2465    // Elements
2466    //
2467
2468    /**
2469     * Get the declared elements for an XML document.
2470     * <p>The results will be valid only after the DTD (if any) has been
2471     * parsed.
2472     * @return An enumeration of all element types declared for this
2473     *         document (as Strings).
2474     * @see #getElementContentType
2475     * @see #getElementContentModel
2476     */
2477    public Enumeration declaredElements() {
2478        return elementInfo.keys();
2479    }
2480
2481    /**
2482     * Look up the content type of an element.
2483     * @param name The element type name.
2484     * @return An integer constant representing the content type.
2485     * @see #getElementContentModel
2486     * @see #CONTENT_UNDECLARED
2487     * @see #CONTENT_ANY
2488     * @see #CONTENT_EMPTY
2489     * @see #CONTENT_MIXED
2490     * @see #CONTENT_ELEMENTS
2491     */
2492    public int getElementContentType(String name) {
2493        Object[] element = (Object[]) elementInfo.get(name);
2494
2495        if (element == null) {
2496            return CONTENT_UNDECLARED;
2497        } else {
2498            return ((Integer) element[0]).intValue();
2499        }
2500    }
2501
2502    /**
2503     * Look up the content model of an element.
2504     * <p>The result will always be null unless the content type is
2505     * CONTENT_ELEMENTS or CONTENT_MIXED.
2506     * @param name The element type name.
2507     * @return The normalised content model, as a string.
2508     * @see #getElementContentType
2509     */
2510    public String getElementContentModel(String name) {
2511        Object[] element = (Object[]) elementInfo.get(name);
2512
2513        if (element == null) {
2514            return null;
2515        } else {
2516            return (String) element[1];
2517        }
2518    }
2519
2520    /**
2521     * Register an element.
2522     * Array format:
2523     *  element type
2524     *  attribute hash table
2525     */
2526    void setElement(String name, int contentType, String contentModel,
2527            Hashtable attributes) throws java.lang.Exception {
2528        Object[] element;
2529
2530        // Try looking up the element
2531        element = (Object[]) elementInfo.get(name);
2532
2533        // Make a new one if necessary.
2534        if (element == null) {
2535            element = new Object[3];
2536            element[0] = Integer.valueOf(CONTENT_UNDECLARED);
2537            element[1] = null;
2538            element[2] = null;
2539        } else if (contentType != CONTENT_UNDECLARED
2540                && ((Integer) element[0]).intValue() != CONTENT_UNDECLARED) {
2541            error("multiple declarations for element type", name, null);
2542            return;
2543        }
2544
2545        // Insert the content type, if any.
2546        if (contentType != CONTENT_UNDECLARED) {
2547            element[0] = Integer.valueOf(contentType);
2548        }
2549
2550        // Insert the content model, if any.
2551        if (contentModel != null) {
2552            element[1] = contentModel;
2553        }
2554
2555        // Insert the attributes, if any.
2556        if (attributes != null) {
2557            element[2] = attributes;
2558        }
2559
2560        // Save the element info.
2561        elementInfo.put(name, element);
2562    }
2563
2564    /**
2565     * Look up the attribute hash table for an element.
2566     * The hash table is the second item in the element array.
2567     */
2568    Hashtable getElementAttributes(String name) {
2569        Object[] element = (Object[]) elementInfo.get(name);
2570
2571        if (element == null) {
2572            return null;
2573        } else {
2574            return (Hashtable) element[2];
2575        }
2576    }
2577
2578    //
2579    // Attributes
2580    //
2581
2582    /**
2583     * Get the declared attributes for an element type.
2584     * @param elname The name of the element type.
2585     * @return An Enumeration of all the attributes declared for
2586     *         a specific element type.  The results will be valid only
2587     *         after the DTD (if any) has been parsed.
2588     * @see #getAttributeType
2589     * @see #getAttributeEnumeration
2590     * @see #getAttributeDefaultValueType
2591     * @see #getAttributeDefaultValue
2592     * @see #getAttributeExpandedValue
2593     */
2594    public Enumeration declaredAttributes(String elname) {
2595        Hashtable attlist = getElementAttributes(elname);
2596
2597        if (attlist == null) {
2598            return null;
2599        } else {
2600            return attlist.keys();
2601        }
2602    }
2603
2604    /**
2605     * Retrieve the declared type of an attribute.
2606     * @param name The name of the associated element.
2607     * @param aname The name of the attribute.
2608     * @return An integer constant representing the attribute type.
2609     * @see #ATTRIBUTE_UNDECLARED
2610     * @see #ATTRIBUTE_CDATA
2611     * @see #ATTRIBUTE_ID
2612     * @see #ATTRIBUTE_IDREF
2613     * @see #ATTRIBUTE_IDREFS
2614     * @see #ATTRIBUTE_ENTITY
2615     * @see #ATTRIBUTE_ENTITIES
2616     * @see #ATTRIBUTE_NMTOKEN
2617     * @see #ATTRIBUTE_NMTOKENS
2618     * @see #ATTRIBUTE_ENUMERATED
2619     * @see #ATTRIBUTE_NOTATION
2620     */
2621    public int getAttributeType(String name, String aname) {
2622        Object[] attribute = getAttribute(name, aname);
2623
2624        if (attribute == null) {
2625            return ATTRIBUTE_UNDECLARED;
2626        } else {
2627            return ((Integer) attribute[0]).intValue();
2628        }
2629    }
2630
2631    /**
2632     * Retrieve the allowed values for an enumerated attribute type.
2633     * @param name The name of the associated element.
2634     * @param aname The name of the attribute.
2635     * @return A string containing the token list.
2636     * @see #ATTRIBUTE_ENUMERATED
2637     * @see #ATTRIBUTE_NOTATION
2638     */
2639    public String getAttributeEnumeration(String name, String aname) {
2640        Object[] attribute = getAttribute(name, aname);
2641
2642        if (attribute == null) {
2643            return null;
2644        } else {
2645            return (String) attribute[3];
2646        }
2647    }
2648
2649    /**
2650     * Retrieve the default value of a declared attribute.
2651     * @param name The name of the associated element.
2652     * @param aname The name of the attribute.
2653     * @return The default value, or null if the attribute was
2654     *         #IMPLIED or simply undeclared and unspecified.
2655     * @see #getAttributeExpandedValue
2656     */
2657    public String getAttributeDefaultValue(String name, String aname) {
2658        Object[] attribute = getAttribute(name, aname);
2659
2660        if (attribute == null) {
2661            return null;
2662        } else {
2663            return (String) attribute[1];
2664        }
2665    }
2666
2667    /**
2668     * Retrieve the expanded value of a declared attribute.
2669     * <p>All general entities will be expanded.
2670     * @param name The name of the associated element.
2671     * @param aname The name of the attribute.
2672     * @return The expanded default value, or null if the attribute was
2673     *         #IMPLIED or simply undeclared
2674     * @see #getAttributeDefaultValue
2675     */
2676    public String getAttributeExpandedValue(String name, String aname) {
2677        Object[] attribute = getAttribute(name, aname);
2678
2679        if (attribute == null) {
2680            return null;
2681        } else if (attribute[4] == null && attribute[1] != null) {
2682            try {
2683                pushString(null, (char) 0 + (String) attribute[1] + (char) 0);
2684                attribute[4] = readLiteral(
2685                        LIT_NORMALIZE | LIT_CHAR_REF | LIT_ENTITY_REF);
2686            } catch (Exception ex) {
2687                // We could ignore this and return but instead return here.
2688                return (String) attribute[4];
2689            }
2690        }
2691
2692        return (String) attribute[4];
2693    }
2694
2695    /**
2696     * Retrieve the default value type of a declared attribute.
2697     * @param name The name of the element.
2698     * @param aname The name of the attribute.
2699     * @return ATTRIBUTE_DEFAULT_UNDECLARED if the attribute
2700     * cannot be found, otherwise return an integer.
2701     * @see #ATTRIBUTE_DEFAULT_SPECIFIED
2702     * @see #ATTRIBUTE_DEFAULT_IMPLIED
2703     * @see #ATTRIBUTE_DEFAULT_REQUIRED
2704     * @see #ATTRIBUTE_DEFAULT_FIXED
2705     */
2706    public int getAttributeDefaultValueType(String name, String aname) {
2707        Object[] attribute = getAttribute(name, aname);
2708
2709        if (attribute == null) {
2710            return ATTRIBUTE_DEFAULT_UNDECLARED;
2711        } else {
2712            return ((Integer) attribute[2]).intValue();
2713        }
2714    }
2715
2716    /**
2717     * Register an attribute declaration for later retrieval.
2718     * Format:
2719     * - String type
2720     * - String default value
2721     * - int value type
2722     * *TODO: do something with attribute types.
2723     */
2724    void setAttribute(String elName, String name, int type, String enumeration,
2725            String value, int valueType) throws java.lang.Exception {
2726        Hashtable attlist;
2727        Object[] attribute;
2728
2729        // Create a new hashtable if necessary.
2730        attlist = getElementAttributes(elName);
2731
2732        if (attlist == null) {
2733            attlist = new Hashtable();
2734        }
2735
2736        // Check that the attribute doesn't
2737        // already exist!
2738        if (attlist.get(name) != null) {
2739            return;
2740        } else {
2741            attribute = new Object[5];
2742            attribute[0] = Integer.valueOf(type);
2743            attribute[1] = value;
2744            attribute[2] = Integer.valueOf(valueType);
2745            attribute[3] = enumeration;
2746            attribute[4] = null;
2747            attlist.put(name.intern(), attribute);
2748
2749            // Use CONTENT_UNDECLARED to avoid overwriting
2750            // existing element declaration.
2751            setElement(elName, CONTENT_UNDECLARED, null, attlist);
2752        }
2753    }
2754
2755    /**
2756     * Retrieve the three-member array representing an
2757     * attribute declaration.
2758     * @param elName The name of the element.
2759     * @param name The name of the attribute.
2760     */
2761    Object[] getAttribute(String elName, String name) {
2762        Hashtable attlist;
2763        Object[] attribute;
2764
2765        attlist = getElementAttributes(elName);
2766
2767        if (attlist == null) {
2768            return null;
2769        }
2770
2771        attribute = (Object[]) attlist.get(name);
2772        return attribute;
2773    }
2774
2775    //
2776    // Entities
2777    //
2778
2779    /**
2780     * Get declared entities.
2781     * @return An Enumeration of all the entities declared for
2782     *         this XML document.  The results will be valid only
2783     *         after the DTD (if any) has been parsed.
2784     * @see #getEntityType
2785     * @see #getEntityPublicId
2786     * @see #getEntitySystemId
2787     * @see #getEntityValue
2788     * @see #getEntityNotationName
2789     */
2790    public Enumeration declaredEntities() {
2791        return entityInfo.keys();
2792    }
2793
2794    /** Return the current element.
2795     *  @return The current Element.
2796     */
2797    public String getCurrentElement() {
2798        // Ptolemy localization for MoMLParser so that we
2799        // can get the currentElement from within MoMLParser.attribute()
2800        return currentElement;
2801    }
2802
2803    /**
2804     * Find the type of an entity.
2805     * @param ename The name of the entity.
2806     * @return An integer constant representing the entity type.
2807     * @see #ENTITY_UNDECLARED
2808     * @see #ENTITY_INTERNAL
2809     * @see #ENTITY_NDATA
2810     * @see #ENTITY_TEXT
2811     */
2812    public int getEntityType(String ename) {
2813        Object[] entity = (Object[]) entityInfo.get(ename);
2814
2815        if (entity == null) {
2816            return ENTITY_UNDECLARED;
2817        } else {
2818            return ((Integer) entity[0]).intValue();
2819        }
2820    }
2821
2822    /**
2823     * Return an external entity's public identifier, if any.
2824     * @param ename The name of the external entity.
2825     * @return The entity's system identifier, or null if the
2826     *         entity was not declared, if it is not an
2827     *         external entity, or if no public identifier was
2828     *         provided.
2829     * @see #getEntityType
2830     */
2831    public String getEntityPublicId(String ename) {
2832        Object[] entity = (Object[]) entityInfo.get(ename);
2833
2834        if (entity == null) {
2835            return null;
2836        } else {
2837            return (String) entity[1];
2838        }
2839    }
2840
2841    /**
2842     * Return an external entity's system identifier.
2843     * @param ename The name of the external entity.
2844     * @return The entity's system identifier, or null if the
2845     *         entity was not declared, or if it is not an
2846     *         external entity.
2847     * @see #getEntityType
2848     */
2849    public String getEntitySystemId(String ename) {
2850        Object[] entity = (Object[]) entityInfo.get(ename);
2851
2852        if (entity == null) {
2853            return null;
2854        } else {
2855            return (String) entity[2];
2856        }
2857    }
2858
2859    /**
2860     * Return the value of an internal entity.
2861     * @param ename The name of the internal entity.
2862     * @return The entity's value, or null if the entity was
2863     *         not declared, or if it is not an internal entity.
2864     * @see #getEntityType
2865     */
2866    public String getEntityValue(String ename) {
2867        Object[] entity = (Object[]) entityInfo.get(ename);
2868
2869        if (entity == null) {
2870            return null;
2871        } else {
2872            return (String) entity[3];
2873        }
2874    }
2875
2876    /**
2877     * Get the notation name associated with an NDATA entity.
2878     * @param eName The NDATA entity name.
2879     * @return The associated notation name, or null if the
2880     *         entity was not declared, or if it is not an
2881     *         NDATA entity.
2882     * @see #getEntityType
2883     */
2884    public String getEntityNotationName(String eName) {
2885        Object[] entity = (Object[]) entityInfo.get(eName);
2886
2887        if (entity == null) {
2888            return null;
2889        } else {
2890            return (String) entity[4];
2891        }
2892    }
2893
2894    /**
2895     * Register an entity declaration for later retrieval.
2896     */
2897    void setInternalEntity(String eName, String value) {
2898        setEntity(eName, ENTITY_INTERNAL, null, null, value, null);
2899    }
2900
2901    /**
2902     * Register an external data entity.
2903     */
2904    void setExternalDataEntity(String eName, String pubid, String sysid,
2905            String nName) {
2906        setEntity(eName, ENTITY_NDATA, pubid, sysid, null, nName);
2907    }
2908
2909    /**
2910     * Register an external text entity.
2911     */
2912    void setExternalTextEntity(String eName, String pubid, String sysid) {
2913        setEntity(eName, ENTITY_TEXT, pubid, sysid, null, null);
2914    }
2915
2916    /**
2917     * Register an entity declaration for later retrieval.
2918     */
2919    void setEntity(String eName, int eClass, String pubid, String sysid,
2920            String value, String nName) {
2921        Object[] entity;
2922
2923        if (entityInfo.get(eName) == null) {
2924            entity = new Object[5];
2925            entity[0] = Integer.valueOf(eClass);
2926            entity[1] = pubid;
2927            entity[2] = sysid;
2928            entity[3] = value;
2929            entity[4] = nName;
2930
2931            entityInfo.put(eName, entity);
2932        }
2933    }
2934
2935    //
2936    // Notations.
2937    //
2938
2939    /**
2940     * Get declared notations.
2941     * @return An Enumeration of all the notations declared for
2942     *         this XML document.  The results will be valid only
2943     *         after the DTD (if any) has been parsed.
2944     * @see #getNotationPublicId
2945     * @see #getNotationSystemId
2946     */
2947    public Enumeration declaredNotations() {
2948        return notationInfo.keys();
2949    }
2950
2951    /**
2952     * Look up the public identifier for a notation.
2953     * You will normally use this method to look up a notation
2954     * that was provided as an attribute value or for an NDATA entity.
2955     * @param nname The name of the notation.
2956     * @return A string containing the public identifier, or null
2957     *         if none was provided or if no such notation was
2958     *         declared.
2959     * @see #getNotationSystemId
2960     */
2961    public String getNotationPublicId(String nname) {
2962        Object[] notation = (Object[]) notationInfo.get(nname);
2963
2964        if (notation == null) {
2965            return null;
2966        } else {
2967            return (String) notation[0];
2968        }
2969    }
2970
2971    /**
2972     * Look up the system identifier for a notation.
2973     * You will normally use this method to look up a notation
2974     * that was provided as an attribute value or for an NDATA entity.
2975     * @param nname The name of the notation.
2976     * @return A string containing the system identifier, or null
2977     *         if no such notation was declared.
2978     * @see #getNotationPublicId
2979     */
2980    public String getNotationSystemId(String nname) {
2981        Object[] notation = (Object[]) notationInfo.get(nname);
2982
2983        if (notation == null) {
2984            return null;
2985        } else {
2986            return (String) notation[1];
2987        }
2988    }
2989
2990    /**
2991     * Register a notation declaration for later retrieval.
2992     * Format:
2993     * - public id
2994     * - system id
2995     */
2996    void setNotation(String nname, String pubid, String sysid)
2997            throws java.lang.Exception {
2998        Object[] notation;
2999
3000        if (notationInfo.get(nname) == null) {
3001            notation = new Object[2];
3002            notation[0] = pubid;
3003            notation[1] = sysid;
3004            notationInfo.put(nname, notation);
3005        } else {
3006            error("multiple declarations of notation", nname, null);
3007        }
3008    }
3009
3010    //
3011    // Location.
3012    //
3013
3014    /**
3015     * Return the current line number.
3016     * @return The current line number.
3017     */
3018    public int getLineNumber() {
3019        return line;
3020    }
3021
3022    /**
3023     * Return the current column number.
3024     * @return The current column number.
3025     */
3026    public int getColumnNumber() {
3027        return column;
3028    }
3029
3030    //////////////////////////////////////////////////////////////////////
3031    // High-level I/O.
3032    //////////////////////////////////////////////////////////////////////
3033
3034    /**
3035     * Read a single character from the readBuffer.
3036     * <p>The readDataChunk() method maintains the buffer.
3037     * <p>If we hit the end of an entity, try to pop the stack and
3038     * keep going.
3039     * <p>(This approach doesn't really enforce XML's rules about
3040     * entity boundaries, but this is not currently a validating
3041     * parser).
3042     * <p>This routine also attempts to keep track of the current
3043     * position in external entities, but it's not entirely accurate.
3044     * @return The next available input character.
3045     * @see #unread(char)
3046     * @see #readDataChunk
3047     * @see #readBuffer
3048     * @see #line
3049     */
3050    char readCh() throws java.lang.Exception {
3051        char c;
3052
3053        // As long as there's nothing in the
3054        // read buffer, try reading more data
3055        // (for an external entity) or popping
3056        // the entity stack (for either).
3057        while (readBufferPos >= readBufferLength) {
3058            switch (sourceType) {
3059            case INPUT_READER:
3060            case INPUT_EXTERNAL:
3061            case INPUT_STREAM:
3062                readDataChunk();
3063
3064                while (readBufferLength < 1) {
3065                    popInput();
3066
3067                    if (readBufferLength < 1) {
3068                        readDataChunk();
3069                    }
3070                }
3071
3072                break;
3073
3074            default:
3075                popInput();
3076                break;
3077            }
3078        }
3079
3080        c = readBuffer[readBufferPos++];
3081
3082        // This is a particularly nasty bit
3083        // of code, that checks for a parameter
3084        // entity reference but peeks ahead to
3085        // catch the '%' in parameter entity
3086        // declarations.
3087        if (c == '%'
3088                && (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE)) {
3089            char c2 = readCh();
3090            unread(c2);
3091
3092            if (!isWhitespace(c2)) {
3093                parsePEReference(context == CONTEXT_ENTITYVALUE);
3094                return readCh();
3095            }
3096        }
3097
3098        if (c == '\n') {
3099            line++;
3100            column = 0;
3101        } else {
3102            column++;
3103        }
3104
3105        return c;
3106    }
3107
3108    /**
3109     * Push a single character back onto the current input stream.
3110     * <p>This method usually pushes the character back onto
3111     * the readBuffer.
3112     * <p>I don't think that this would ever be called with
3113     * readBufferPos = 0, because the methods always reads a character
3114     * before unreading it, but just in case, I've added a boundary
3115     * condition.
3116     * @param c The character to push back.
3117     * @see #readCh
3118     * @see #unread(char[], int)
3119     * @see #readBuffer
3120     */
3121    void unread(char c) throws java.lang.Exception {
3122        // Normal condition.
3123        if (c == '\n') {
3124            line--;
3125            column = -1;
3126        }
3127
3128        if (readBufferPos > 0) {
3129            readBuffer[--readBufferPos] = c;
3130        } else {
3131            pushString(null, Character.toString(c));
3132        }
3133    }
3134
3135    /**
3136     * Push a char array back onto the current input stream.
3137     * <p>NOTE: you must <em>never</em> push back characters that you
3138     * haven't actually read: use pushString() instead.
3139     * @see #readCh
3140     * @see #unread(char)
3141     * @see #readBuffer
3142     * @see #pushString
3143     */
3144    void unread(char[] ch, int length) throws java.lang.Exception {
3145        for (int i = 0; i < length; i++) {
3146            if (ch[i] == '\n') {
3147                line--;
3148                column = -1;
3149            }
3150        }
3151
3152        if (length < readBufferPos) {
3153            readBufferPos -= length;
3154        } else {
3155            pushCharArray(null, ch, 0, length);
3156            sourceType = INPUT_BUFFER;
3157        }
3158    }
3159
3160    /**
3161     * Push a new external input source.
3162     * <p>The source will be either an external text entity, or the DTD
3163     * external subset.
3164     * <p>TO DO: Right now, this method always attempts to autodetect
3165     * the encoding; in the future, it should allow the caller to
3166     * request an encoding explicitly, and it should also look at the
3167     * headers with an HTTP connection.
3168     * @param ename
3169     * @param publicId
3170     * @param systemId
3171     * @param reader
3172     * @param stream
3173     * @param encoding
3174     * @exception Exception
3175     * @see XmlHandler#resolveEntity
3176     * @see #pushString
3177     * @see #sourceType
3178     * @see #pushInput
3179     * @see #detectEncoding
3180     * @see #sourceType
3181     * @see #readBuffer
3182     */
3183    void pushURL(String ename, String publicId, String systemId, Reader reader,
3184            InputStream stream, String encoding) throws java.lang.Exception {
3185        URL url;
3186        boolean ignoreEncoding = false;
3187
3188        // Push the existing status.
3189        pushInput(ename);
3190
3191        // Create a new read buffer.
3192        // (Note the four-character margin)
3193        readBuffer = new char[READ_BUFFER_MAX + 4];
3194        readBufferPos = 0;
3195        readBufferLength = 0;
3196        readBufferOverflow = -1;
3197        is = null;
3198        line = 1;
3199
3200        currentByteCount = 0;
3201
3202        // Flush any remaining data.
3203        dataBufferFlush();
3204
3205        // Make the URL absolute.
3206        if (systemId != null && externalEntity != null) {
3207            systemId = new URL(externalEntity.getURL(), systemId).toString();
3208        } else if (baseURI != null) {
3209            try {
3210                systemId = new URL(new URL(baseURI), systemId).toString();
3211            } catch (Throwable throwable) {
3212                // Ignore this and stick with the old systemId
3213            }
3214        }
3215
3216        // See if the application wants to
3217        // redirect the system ID and/or
3218        // supply its own character stream.
3219        if (systemId != null && handler != null) {
3220            Object input = handler.resolveEntity(publicId, systemId);
3221
3222            if (input != null) {
3223                if (input instanceof String) {
3224                    systemId = (String) input;
3225                } else if (input instanceof InputStream) {
3226                    stream = (InputStream) input;
3227                } else if (input instanceof Reader) {
3228                    reader = (Reader) input;
3229                }
3230            }
3231        }
3232
3233        // Start the entity.
3234        if (handler != null) {
3235            if (systemId != null) {
3236                handler.startExternalEntity(systemId);
3237            } else {
3238                handler.startExternalEntity("[external stream]");
3239            }
3240        }
3241
3242        // Figure out what we're reading from.
3243        if (reader != null) {
3244            // There's an explicit character stream.
3245            sourceType = INPUT_READER;
3246            this.reader = reader;
3247            tryEncodingDecl(true);
3248            return;
3249        } else if (stream != null) {
3250            sourceType = INPUT_STREAM;
3251            is = stream;
3252        } else {
3253            // We have to open our own stream
3254            // to the URL.
3255            // Set the new status
3256            sourceType = INPUT_EXTERNAL;
3257            url = new URL(systemId);
3258
3259            externalEntity = url.openConnection();
3260            externalEntity.connect();
3261            is = externalEntity.getInputStream();
3262        }
3263
3264        // If we get to here, there must be
3265        // an InputStream available.
3266        if (!is.markSupported()) {
3267            is = new BufferedInputStream(is);
3268        }
3269
3270        // Attempt to detect the encoding.
3271        if (encoding == null && externalEntity != null) {
3272            encoding = externalEntity.getContentEncoding();
3273        }
3274
3275        if (encoding != null) {
3276            checkEncoding(encoding, false);
3277            ignoreEncoding = true;
3278        } else {
3279            detectEncoding();
3280            ignoreEncoding = false;
3281        }
3282
3283        // Read an XML or text declaration.
3284        tryEncodingDecl(ignoreEncoding);
3285    }
3286
3287    /**
3288     * Check for an encoding declaration.
3289     */
3290    void tryEncodingDecl(boolean ignoreEncoding) throws java.lang.Exception {
3291        // Read the XML/Encoding declaration.
3292        if (tryRead("<?xml")) {
3293            if (tryWhitespace()) {
3294                if (inputStack.size() > 0) {
3295                    parseTextDecl(ignoreEncoding);
3296                } else {
3297                    parseXMLDecl(ignoreEncoding);
3298                }
3299            } else {
3300                unread("xml".toCharArray(), 3);
3301                parsePI();
3302            }
3303        }
3304    }
3305
3306    /**
3307     * Attempt to detect the encoding of an entity.
3308     * <p>The trick here (as suggested in the XML standard) is that
3309     * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
3310     * <b>must</b> begin with an XML declaration or an encoding
3311     * declaration; we simply have to look for "&lt;?XML" in various
3312     * encodings.
3313     * <p>This method has no way to distinguish among 8-bit encodings.
3314     * Instead, it assumes UTF-8, then (possibly) revises its assumption
3315     * later in checkEncoding().  Any ASCII-derived 8-bit encoding
3316     * should work, but most will be rejected later by checkEncoding().
3317     * <p>I don't currently detect EBCDIC, since I'm concerned that it
3318     * could also be a valid UTF-8 sequence; I'll have to do more checking
3319     * later.
3320     * @see #tryEncoding(byte[], byte, byte, byte, byte)
3321     * @see #tryEncoding(byte[], byte, byte)
3322     * @see #checkEncoding
3323     * @see #read8bitEncodingDeclaration
3324     */
3325    void detectEncoding() throws java.lang.Exception {
3326        byte[] signature = new byte[4];
3327
3328        // Read the first four bytes for
3329        // autodetection.
3330        is.mark(4);
3331        int bytesRead = is.read(signature);
3332        if (bytesRead != signature.length) {
3333            throw new IOException("Read only " + bytesRead
3334                    + " bytes instead of " + signature.length);
3335
3336        }
3337        is.reset();
3338
3339        // Look for a known signature.
3340        if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x00,
3341                (byte) 0x3c)) {
3342            // UCS-4 must begin with "<!XML"
3343            // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
3344            encoding = ENCODING_UCS_4_1234;
3345        } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, (byte) 0x00,
3346                (byte) 0x00)) {
3347            // UCS-4 must begin with "<!XML"
3348            // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
3349            encoding = ENCODING_UCS_4_4321;
3350        } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x3c,
3351                (byte) 0x00)) {
3352            // UCS-4 must begin with "<!XML"
3353            // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
3354            encoding = ENCODING_UCS_4_2143;
3355        } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, (byte) 0x00,
3356                (byte) 0x00)) {
3357            // UCS-4 must begin with "<!XML"
3358            // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
3359            encoding = ENCODING_UCS_4_3412;
3360        } else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) {
3361            // UCS-2 with a byte-order marker.
3362            // 0xfe 0xff: UCS-2, big-endian (12)
3363            encoding = ENCODING_UCS_2_12;
3364            is.read();
3365            is.read();
3366        } else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) {
3367            // UCS-2 with a byte-order marker.
3368            // 0xff 0xfe: UCS-2, little-endian (21)
3369            encoding = ENCODING_UCS_2_21;
3370            is.read();
3371            is.read();
3372        } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, (byte) 0x00,
3373                (byte) 0x3f)) {
3374            // UCS-2 without a BOM must begin with "<?XML"
3375            // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
3376            encoding = ENCODING_UCS_2_12;
3377            error("no byte-order mark for UCS-2 entity", null, null);
3378        } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, (byte) 0x3f,
3379                (byte) 0x00)) {
3380            // UCS-2 without a BOM must begin with "<?XML"
3381            // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
3382            encoding = ENCODING_UCS_2_21;
3383            error("no byte-order mark for UCS-2 entity", null, null);
3384        } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f, (byte) 0x78,
3385                (byte) 0x6d)) {
3386            // Some kind of 8-bit encoding with "<?XML"
3387            // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
3388            encoding = ENCODING_UTF_8;
3389            read8bitEncodingDeclaration();
3390        } else {
3391            // Some kind of 8-bit encoding without "<?XML"
3392            // (otherwise) UTF-8 without encoding/XML declaration
3393            encoding = ENCODING_UTF_8;
3394        }
3395    }
3396
3397    /**
3398     * Check for a four-byte signature.
3399     * <p>Utility routine for detectEncoding().
3400     * <p>Always looks for some part of "&lt;?XML" in a specific encoding.
3401     * @param sig The first four bytes read.
3402     * @param b1 The first byte of the signature
3403     * @param b2 The second byte of the signature
3404     * @param b3 The third byte of the signature
3405     * @param b4 The fourth byte of the signature
3406     * @see #detectEncoding
3407     */
3408    boolean tryEncoding(byte[] sig, byte b1, byte b2, byte b3, byte b4) {
3409        return sig[0] == b1 && sig[1] == b2 && sig[2] == b3 && sig[3] == b4;
3410    }
3411
3412    /**
3413     * Check for a two-byte signature.
3414     * <p>Looks for a UCS-2 byte-order mark.
3415     * <p>Utility routine for detectEncoding().
3416     * @param sig The first four bytes read.
3417     * @param b1 The first byte of the signature
3418     * @param b2 The second byte of the signature
3419     * @see #detectEncoding
3420     */
3421    boolean tryEncoding(byte[] sig, byte b1, byte b2) {
3422        return sig[0] == b1 && sig[1] == b2;
3423    }
3424
3425    /**
3426     * This method pushes a string back onto input.
3427     * <p>It is useful either as the expansion of an internal entity,
3428     * or for backtracking during the parse.
3429     * <p>Call pushCharArray() to do the actual work.
3430     * @param s The string to push back onto input.
3431     * @see #pushCharArray
3432     */
3433    void pushString(String ename, String s) throws java.lang.Exception {
3434        char[] ch = s.toCharArray();
3435        pushCharArray(ename, ch, 0, ch.length);
3436    }
3437
3438    /**
3439     * Push a new internal input source.
3440     * <p>This method is useful for expanding an internal entity,
3441     * or for unreading a string of characters.  It creates a new
3442     * readBuffer containing the characters in the array, instead
3443     * of characters converted from an input byte stream.
3444     * <p>I've added a couple of optimisations: don't push zero-
3445     * length strings, and just push back a single character
3446     * for 1-character strings; this should save some time and memory.
3447     * @param ch The char array to push.
3448     * @see #pushString
3449     * @see #pushURL
3450     * @see #readBuffer
3451     * @see #sourceType
3452     * @see #pushInput
3453     */
3454    void pushCharArray(String ename, char[] ch, int start, int length)
3455            throws java.lang.Exception {
3456        // Push the existing status
3457        pushInput(ename);
3458        sourceType = INPUT_INTERNAL;
3459        readBuffer = ch;
3460        readBufferPos = start;
3461        readBufferLength = length;
3462        readBufferOverflow = -1;
3463    }
3464
3465    /**
3466     * Save the current input source onto the stack.
3467     * <p>This method saves all of the global variables associated with
3468     * the current input source, so that they can be restored when a new
3469     * input source has finished.  It also tests for entity recursion.
3470     * <p>The method saves the following global variables onto a stack
3471     * using a fixed-length array:
3472     * <ol>
3473     * <li>sourceType</li>
3474     * <li>externalEntity</li>
3475     * <li>readBuffer</li>
3476     * <li>readBufferPos</li>
3477     * <li>readBufferLength</li>
3478     * <li>line</li>
3479     * <li>encoding</li>
3480     * </ol>
3481     * @param ename The name of the entity (if any) causing the new input.
3482     * @see #popInput
3483     * @see #sourceType
3484     * @see #externalEntity
3485     * @see #readBuffer
3486     * @see #readBufferPos
3487     * @see #readBufferLength
3488     * @see #line
3489     * @see #encoding
3490     */
3491    void pushInput(String ename) throws java.lang.Exception {
3492        Object[] input = new Object[12];
3493
3494        // Check for entity recursion.
3495        if (ename != null) {
3496            Enumeration entities = entityStack.elements();
3497
3498            while (entities.hasMoreElements()) {
3499                String e = (String) entities.nextElement();
3500
3501                if (e.equals(ename)) {
3502                    error("recursive reference to entity", ename, null);
3503                }
3504            }
3505        }
3506
3507        entityStack.push(ename);
3508
3509        // Don't bother if there is no input.
3510        if (sourceType == INPUT_NONE) {
3511            return;
3512        }
3513
3514        // Set up a snapshot of the current
3515        // input source.
3516        input[0] = Integer.valueOf(sourceType);
3517        input[1] = externalEntity;
3518        input[2] = readBuffer;
3519        input[3] = Integer.valueOf(readBufferPos);
3520        input[4] = Integer.valueOf(readBufferLength);
3521        input[5] = Integer.valueOf(line);
3522        input[6] = Integer.valueOf(encoding);
3523        input[7] = Integer.valueOf(readBufferOverflow);
3524        input[8] = is;
3525        input[9] = Integer.valueOf(currentByteCount);
3526        input[10] = Integer.valueOf(column);
3527        input[11] = reader;
3528
3529        // Push it onto the stack.
3530        inputStack.push(input);
3531    }
3532
3533    /**
3534     * Restore a previous input source.
3535     * <p>This method restores all of the global variables associated with
3536     * the current input source.
3537     * @exception java.io.EOFException
3538     *    If there are no more entries on the input stack.
3539     * @see #pushInput
3540     * @see #sourceType
3541     * @see #externalEntity
3542     * @see #readBuffer
3543     * @see #readBufferPos
3544     * @see #readBufferLength
3545     * @see #line
3546     * @see #encoding
3547     */
3548    void popInput() throws java.lang.Exception {
3549        Object[] input;
3550
3551        switch (sourceType) {
3552        case INPUT_EXTERNAL:
3553            dataBufferFlush();
3554
3555            if (handler != null && externalEntity != null) {
3556                handler.endExternalEntity(externalEntity.getURL().toString());
3557            }
3558
3559            break;
3560
3561        case INPUT_STREAM:
3562            dataBufferFlush();
3563
3564            if (baseURI != null) {
3565                if (handler != null) {
3566                    handler.endExternalEntity(baseURI);
3567                }
3568            }
3569
3570            break;
3571
3572        case INPUT_READER:
3573            dataBufferFlush();
3574
3575            if (baseURI != null) {
3576                if (handler != null) {
3577                    handler.endExternalEntity(baseURI);
3578                }
3579            }
3580
3581            break;
3582        }
3583
3584        // Throw an EOFException if there
3585        // is nothing else to pop.
3586        if (inputStack.isEmpty()) {
3587            throw new EOFException("XML parser input stack was empty, "
3588                    + "end of file or xml fragment reached. "
3589                    + "Perhaps there is a missing '>' "
3590                    + "or a comment is unterminated by '->'?");
3591        } else {
3592            input = (Object[]) inputStack.pop();
3593            entityStack.pop();
3594        }
3595
3596        sourceType = ((Integer) input[0]).intValue();
3597        externalEntity = (URLConnection) input[1];
3598        readBuffer = (char[]) input[2];
3599        readBufferPos = ((Integer) input[3]).intValue();
3600        readBufferLength = ((Integer) input[4]).intValue();
3601        line = ((Integer) input[5]).intValue();
3602        encoding = ((Integer) input[6]).intValue();
3603        readBufferOverflow = ((Integer) input[7]).intValue();
3604        is = (InputStream) input[8];
3605        currentByteCount = ((Integer) input[9]).intValue();
3606        column = ((Integer) input[10]).intValue();
3607        reader = (Reader) input[11];
3608    }
3609
3610    /**
3611     * Return true if we can read the expected character.
3612     * <p>Note that the character will be removed from the input stream
3613     * on success, but will be put back on failure.  Do not attempt to
3614     * read the character again if the method succeeds.
3615     * @param delim The character that should appear next.  For a
3616     *              insensitive match, you must supply this in upper-case.
3617     * @return true if the character was successfully read, or false if
3618     *         it was not.
3619     * @see #tryRead(String)
3620     */
3621    boolean tryRead(char delim) throws java.lang.Exception {
3622        char c;
3623
3624        // Read the character
3625        c = readCh();
3626
3627        // Test for a match, and push the character
3628        // back if the match fails.
3629        if (c == delim) {
3630            return true;
3631        } else {
3632            unread(c);
3633            return false;
3634        }
3635    }
3636
3637    /**
3638     * Return true if we can read the expected string.
3639     * <p>This is simply a convenience method.
3640     * <p>Note that the string will be removed from the input stream
3641     * on success, but will be put back on failure.  Do not attempt to
3642     * read the string again if the method succeeds.
3643     * <p>This method will push back a character rather than an
3644     * array whenever possible (probably the majority of cases).
3645     * <p><b>NOTE:</b> This method currently has a hard-coded limit
3646     * of 100 characters for the delimiter.
3647     * @param delim The string that should appear next.
3648     * @return true if the string was successfully read, or false if
3649     *         it was not.
3650     * @see #tryRead(char)
3651     */
3652    boolean tryRead(String delim) throws java.lang.Exception {
3653        char[] ch = delim.toCharArray();
3654        char c;
3655
3656        // Compare the input, character-
3657        // by character.
3658        for (int i = 0; i < ch.length; i++) {
3659            c = readCh();
3660
3661            if (c != ch[i]) {
3662                unread(c);
3663
3664                if (i != 0) {
3665                    unread(ch, i);
3666                }
3667
3668                return false;
3669            }
3670        }
3671
3672        return true;
3673    }
3674
3675    /**
3676     * Return true if we can read some whitespace.
3677     * <p>This is simply a convenience method.
3678     * <p>This method will push back a character rather than an
3679     * array whenever possible (probably the majority of cases).
3680     * @return true if whitespace was found.
3681     */
3682    boolean tryWhitespace() throws java.lang.Exception {
3683        char c;
3684        c = readCh();
3685
3686        if (isWhitespace(c)) {
3687            skipWhitespace();
3688            return true;
3689        } else {
3690            unread(c);
3691            return false;
3692        }
3693    }
3694
3695    /**
3696     * Read all data until we find the specified string.
3697     * <p>This is especially useful for scanning marked sections.
3698     * <p>This is a a little inefficient right now, since it calls tryRead()
3699     * for every character.
3700     * @param delim The string delimiter
3701     * @see #tryRead(String)
3702     * @see #readCh
3703     */
3704    void parseUntil(String delim) throws java.lang.Exception {
3705        char c;
3706        int startLine = line;
3707
3708        try {
3709            while (!tryRead(delim)) {
3710                c = readCh();
3711                dataBufferAppend(c);
3712            }
3713        } catch (EOFException e) {
3714            error("end of input while looking for delimiter (started on line "
3715                    + startLine + ')', null, delim);
3716        }
3717    }
3718
3719    // Modified November 14, 1998 by Steve Neuendorffer
3720    // There was a bug because this was not skipping things that looked
3721    // like parameter entities properly.
3722    // Copied the appropriate code from readCh, excluding the lines referring to
3723    // '%'.
3724
3725    /**
3726     * Skip all data until we find the specified string.
3727     * <p>This is especially useful for scanning comments.
3728     * <p>This is a a little inefficient right now, since it calls tryRead()
3729     * for every character.
3730     * @param delim The string delimiter
3731     * @see #readCh
3732     */
3733    void skipUntil(String delim) throws java.lang.Exception {
3734        while (!tryRead(delim)) {
3735            char c;
3736
3737            // As long as there's nothing in the
3738            // read buffer, try reading more data
3739            // (for an external entity) or popping
3740            // the entity stack (for either).
3741            while (readBufferPos >= readBufferLength) {
3742                switch (sourceType) {
3743                case INPUT_READER:
3744                case INPUT_EXTERNAL:
3745                case INPUT_STREAM:
3746                    readDataChunk();
3747
3748                    while (readBufferLength < 1) {
3749                        popInput();
3750
3751                        if (readBufferLength < 1) {
3752                            readDataChunk();
3753                        }
3754                    }
3755
3756                    break;
3757
3758                default:
3759                    popInput();
3760                    break;
3761                }
3762            }
3763
3764            c = readBuffer[readBufferPos++];
3765
3766            if (c == '\n') {
3767                line++;
3768                column = 0;
3769            } else {
3770                column++;
3771            }
3772        }
3773    }
3774
3775    /**
3776     * Read just the encoding declaration (or XML declaration) at the
3777     * start of an external entity.
3778     * When this method is called, we know that the declaration is
3779     * present (or appears to be).  We also know that the entity is
3780     * in some sort of ASCII-derived 8-bit encoding.
3781     * The idea of this is to let us read what the 8-bit encoding is
3782     * before we've committed to converting any more of the file; the
3783     * XML or encoding declaration must be in 7-bit ASCII, so we're
3784     * safe as long as we don't go past it.
3785     */
3786    void read8bitEncodingDeclaration() throws java.lang.Exception {
3787        int ch;
3788        readBufferPos = readBufferLength = 0;
3789
3790        while (true) {
3791            ch = is.read();
3792            readBuffer[readBufferLength++] = (char) ch;
3793
3794            switch (ch) {
3795            case '>':
3796                return;
3797
3798            case -1:
3799                error("end of file before end of XML or encoding declaration.",
3800                        null, "?>");
3801                return;
3802            }
3803
3804            if (readBuffer.length == readBufferLength) {
3805                error("unfinished XML or encoding declaration", null, null);
3806            }
3807        }
3808    }
3809
3810    //////////////////////////////////////////////////////////////////////
3811    // Low-level I/O.
3812    //////////////////////////////////////////////////////////////////////
3813
3814    /**
3815     * Read a chunk of data from an external input source.
3816     * <p>This is simply a front-end that fills the rawReadBuffer
3817     * with bytes, then calls the appropriate encoding handler.
3818     * @see #encoding
3819     * @see #rawReadBuffer
3820     * @see #readBuffer
3821     * @see #filterCR
3822     * @see #copyUtf8ReadBuffer
3823     * @see #copyIso8859_1ReadBuffer
3824     */
3825    void readDataChunk() throws java.lang.Exception {
3826        int count;
3827
3828        // See if we have any overflow.
3829        if (readBufferOverflow > -1) {
3830            readBuffer[0] = (char) readBufferOverflow;
3831            readBufferOverflow = -1;
3832            readBufferPos = 1;
3833            sawCR = true;
3834        } else {
3835            readBufferPos = 0;
3836            sawCR = false;
3837        }
3838
3839        // Special situation -- we're taking
3840        // input from a character stream.
3841        if (sourceType == INPUT_READER) {
3842            count = reader.read(readBuffer, readBufferPos, READ_BUFFER_MAX - 1);
3843
3844            if (count < 0) {
3845                readBufferLength = -1;
3846            } else {
3847                readBufferLength = readBufferPos + count;
3848                filterCR();
3849                sawCR = false;
3850            }
3851
3852            return;
3853        }
3854
3855        // Read as many bytes as possible
3856        // into the read buffer.
3857        count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX);
3858
3859        // Dispatch to an encoding-specific
3860        // reader method to populate the
3861        // readBuffer.
3862        switch (encoding) {
3863        case ENCODING_UTF_8:
3864            copyUtf8ReadBuffer(count);
3865            break;
3866
3867        case ENCODING_ISO_8859_1:
3868            copyIso8859_1ReadBuffer(count);
3869            break;
3870
3871        case ENCODING_UCS_2_12:
3872            copyUcs2ReadBuffer(count, 8, 0);
3873            break;
3874
3875        case ENCODING_UCS_2_21:
3876            copyUcs2ReadBuffer(count, 0, 8);
3877            break;
3878
3879        case ENCODING_UCS_4_1234:
3880            copyUcs4ReadBuffer(count, 24, 16, 8, 0);
3881            break;
3882
3883        case ENCODING_UCS_4_4321:
3884            copyUcs4ReadBuffer(count, 0, 8, 16, 24);
3885            break;
3886
3887        case ENCODING_UCS_4_2143:
3888            copyUcs4ReadBuffer(count, 16, 24, 0, 8);
3889            break;
3890
3891        case ENCODING_UCS_4_3412:
3892            copyUcs4ReadBuffer(count, 8, 0, 24, 16);
3893            break;
3894        }
3895
3896        // Filter out all carriage returns
3897        // if we've seen any.
3898        if (sawCR) {
3899            filterCR();
3900            sawCR = false;
3901        }
3902
3903        // Reset the position.
3904        readBufferPos = 0;
3905        currentByteCount += count;
3906    }
3907
3908    /**
3909     * Filter carriage returns in the read buffer.
3910     * <p>CRLF becomes LF; CR becomes LF.
3911     * @see #readDataChunk
3912     * @see #readBuffer
3913     * @see #readBufferOverflow
3914     */
3915    void filterCR() {
3916        int i;
3917        int j;
3918
3919        readBufferOverflow = -1;
3920
3921        loop: for (i = 0, j = 0; j < readBufferLength; i++, j++) {
3922            switch (readBuffer[j]) {
3923            case '\r':
3924
3925                if (j == readBufferLength - 1) {
3926                    readBufferOverflow = '\r';
3927                    readBufferLength--;
3928                    break loop;
3929                } else if (readBuffer[j + 1] == '\n') {
3930                    j++;
3931                }
3932
3933                readBuffer[i] = '\n';
3934                break;
3935
3936            case '\n':
3937            default:
3938                readBuffer[i] = readBuffer[j];
3939                break;
3940            }
3941        }
3942
3943        readBufferLength = i;
3944    }
3945
3946    /**
3947     * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
3948     * <p>When readDataChunk() calls this method, the raw bytes are in
3949     * rawReadBuffer, and the final characters will appear in
3950     * readBuffer.
3951     * <p>The tricky part of this is dealing with UTF-8 multi-byte
3952     * sequences, but it doesn't seem to slow things down too much.
3953     * @param count The number of bytes to convert.
3954     * @see #readDataChunk
3955     * @see #rawReadBuffer
3956     * @see #readBuffer
3957     * @see #getNextUtf8Byte
3958     */
3959    void copyUtf8ReadBuffer(int count) throws java.lang.Exception {
3960        int i = 0;
3961        int j = readBufferPos;
3962        int b1;
3963
3964        while (i < count) {
3965            b1 = rawReadBuffer[i++];
3966
3967            // Determine whether we are dealing
3968            // with a one-, two-, three-, or four-
3969            // byte sequence.
3970            if ((b1 & 0x80) == 0) {
3971                // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
3972                readBuffer[j++] = (char) b1;
3973            } else if ((b1 & 0xe0) == 0xc0) {
3974                // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
3975                readBuffer[j++] = (char) ((b1 & 0x1f) << 6
3976                        | getNextUtf8Byte(i++, count));
3977            } else if ((b1 & 0xf0) == 0xe0) {
3978                // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
3979                readBuffer[j++] = (char) ((b1 & 0x0f) << 12
3980                        | getNextUtf8Byte(i++, count) << 6
3981                        | getNextUtf8Byte(i++, count));
3982            } else if ((b1 & 0xf8) == 0xf0) {
3983                // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
3984                //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
3985                // (uuuuu = wwww + 1)
3986                int b2 = getNextUtf8Byte(i++, count);
3987                int b3 = getNextUtf8Byte(i++, count);
3988                int b4 = getNextUtf8Byte(i++, count);
3989                readBuffer[j++] = (char) (0xd800
3990                        | ((b1 & 0x07) << 2 | ((b2 & 0x30) >> 4) - 1) << 6
3991                        | (b2 & 0x0f) << 2 | (b3 & 0x30) >> 4);
3992                readBuffer[j++] = (char) (0xdc | (b3 & 0x0f) << 6 | b4);
3993
3994                // TODO: test that surrogate value is legal.
3995            } else {
3996                // Otherwise, the 8th bit may not be set in UTF-8
3997                encodingError("bad start for UTF-8 multi-byte sequence", b1, i);
3998            }
3999
4000            if (readBuffer[j - 1] == '\r') {
4001                sawCR = true;
4002            }
4003        }
4004
4005        // How many characters have we read?
4006        readBufferLength = j;
4007    }
4008
4009    /**
4010     * Return the next byte value in a UTF-8 sequence.
4011     * If it is not possible to get a byte from the current
4012     * entity, throw an exception.
4013     * @param pos The current position in the rawReadBuffer.
4014     * @param count The number of bytes in the rawReadBuffer
4015     * @return The significant six bits of a non-initial byte in
4016     *         a UTF-8 sequence.
4017     * @exception EOFException If the sequence is incomplete.
4018     */
4019    int getNextUtf8Byte(int pos, int count) throws java.lang.Exception {
4020        int val;
4021
4022        // Take a character from the buffer
4023        // or from the actual input stream.
4024        if (pos < count) {
4025            val = rawReadBuffer[pos];
4026        } else {
4027            val = is.read();
4028
4029            if (val == -1) {
4030                encodingError("unfinished multi-byte UTF-8 sequence at EOF", -1,
4031                        pos);
4032            }
4033        }
4034
4035        // Check for the correct bits at the
4036        // start.
4037        if ((val & 0xc0) != 0x80) {
4038            encodingError("bad continuation of multi-byte UTF-8 sequence", val,
4039                    pos + 1);
4040        }
4041
4042        // Return the significant bits.
4043        return val & 0x3f;
4044    }
4045
4046    /**
4047     * Convert a buffer of ISO-8859-1-encoded bytes into UTF-16 characters.
4048     * <p>When readDataChunk() calls this method, the raw bytes are in
4049     * rawReadBuffer, and the final characters will appear in
4050     * readBuffer.
4051     * <p>This is a direct conversion, with no tricks.
4052     * @param count The number of bytes to convert.
4053     * @see #readDataChunk
4054     * @see #rawReadBuffer
4055     * @see #readBuffer
4056     */
4057    void copyIso8859_1ReadBuffer(int count) {
4058        int i;
4059        int j;
4060
4061        for (i = 0, j = readBufferPos; i < count; i++, j++) {
4062            readBuffer[j] = (char) (rawReadBuffer[i] & 0xff);
4063
4064            if (readBuffer[j] == '\r') {
4065                sawCR = true;
4066            }
4067        }
4068
4069        readBufferLength = j;
4070    }
4071
4072    /**
4073     * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters.
4074     * <p>When readDataChunk() calls this method, the raw bytes are in
4075     * rawReadBuffer, and the final characters will appear in
4076     * readBuffer.
4077     * @param count The number of bytes to convert.
4078     * @param shift1 The number of bits to shift byte 1.
4079     * @param shift2 The number of bits to shift byte 2
4080     * @see #readDataChunk
4081     * @see #rawReadBuffer
4082     * @see #readBuffer
4083     */
4084    void copyUcs2ReadBuffer(int count, int shift1, int shift2)
4085            throws java.lang.Exception {
4086        int j = readBufferPos;
4087
4088        if (count > 0 && count % 2 != 0) {
4089            encodingError("odd number of bytes in UCS-2 encoding", -1, count);
4090        }
4091
4092        for (int i = 0; i < count; i += 2) {
4093            readBuffer[j++] = (char) ((rawReadBuffer[i] & 0xff) << shift1
4094                    | (rawReadBuffer[i + 1] & 0xff) << shift2);
4095
4096            if (readBuffer[j - 1] == '\r') {
4097                sawCR = true;
4098            }
4099        }
4100
4101        readBufferLength = j;
4102    }
4103
4104    /**
4105     * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
4106     * <p>When readDataChunk() calls this method, the raw bytes are in
4107     * rawReadBuffer, and the final characters will appear in
4108     * readBuffer.
4109     * <p>Java has 16-bit chars, but this routine will attempt to use
4110     * surrogates to encoding values between 0x00010000 and 0x000fffff.
4111     * @param count The number of bytes to convert.
4112     * @param shift1 The number of bits to shift byte 1.
4113     * @param shift2 The number of bits to shift byte 2
4114     * @param shift3 The number of bits to shift byte 2
4115     * @param shift4 The number of bits to shift byte 2
4116     * @see #readDataChunk
4117     * @see #rawReadBuffer
4118     * @see #readBuffer
4119     */
4120    void copyUcs4ReadBuffer(int count, int shift1, int shift2, int shift3,
4121            int shift4) throws java.lang.Exception {
4122        int j = readBufferPos;
4123        int value;
4124
4125        if (count > 0 && count % 4 != 0) {
4126            encodingError(
4127                    "number of bytes in UCS-4 encoding not divisible by 4", -1,
4128                    count);
4129        }
4130
4131        for (int i = 0; i < count; i += 4) {
4132            value = (rawReadBuffer[i] & 0xff) << shift1
4133                    | (rawReadBuffer[i + 1] & 0xff) << shift2
4134                    | (rawReadBuffer[i + 2] & 0xff) << shift3
4135                    | (rawReadBuffer[i + 3] & 0xff) << shift4;
4136
4137            if (value < 0x0000ffff) {
4138                readBuffer[j++] = (char) value;
4139
4140                if (value == '\r') {
4141                    sawCR = true;
4142                }
4143            } else if (value < 0x000fffff) {
4144                readBuffer[j++] = (char) (0xd8 | (value & 0x000ffc00) >> 10);
4145                readBuffer[j++] = (char) (0xdc | value & 0x0003ff);
4146            } else {
4147                encodingError("value cannot be represented in UTF-16", value,
4148                        i);
4149            }
4150        }
4151
4152        readBufferLength = j;
4153    }
4154
4155    /**
4156     * Report a character encoding error.
4157     */
4158    void encodingError(String message, int value, int offset)
4159            throws java.lang.Exception {
4160        String uri;
4161
4162        if (value >= 0) {
4163            message = message + " (byte value: 0x" + Integer.toHexString(value)
4164                    + ')';
4165        }
4166
4167        if (externalEntity != null) {
4168            uri = externalEntity.getURL().toString();
4169        } else {
4170            uri = baseURI;
4171        }
4172
4173        handler.error(message, uri, -1, offset + currentByteCount);
4174    }
4175
4176    //////////////////////////////////////////////////////////////////////
4177    // Local Variables.
4178    //////////////////////////////////////////////////////////////////////
4179
4180    /**
4181     * Re-initialize the variables for each parse.
4182     */
4183    void initializeVariables() {
4184        // No errors; first lineb
4185        //errorCount = 0;
4186        line = 1;
4187        column = 0;
4188
4189        // Set up the buffers for data and names
4190        dataBufferPos = 0;
4191        dataBuffer = new char[DATA_BUFFER_INITIAL];
4192        nameBufferPos = 0;
4193        nameBuffer = new char[NAME_BUFFER_INITIAL];
4194
4195        // Set up the DTD hash tables
4196        elementInfo = new Hashtable();
4197        entityInfo = new Hashtable();
4198        notationInfo = new Hashtable();
4199
4200        // Set up the variables for the current
4201        // element context.
4202        currentElement = null;
4203        currentElementContent = CONTENT_UNDECLARED;
4204
4205        // Set up the input variables
4206        sourceType = INPUT_NONE;
4207        inputStack = new Stack();
4208        entityStack = new Stack();
4209        externalEntity = null;
4210        tagAttributePos = 0;
4211        tagAttributes = new String[100];
4212        rawReadBuffer = new byte[READ_BUFFER_MAX];
4213        readBufferOverflow = -1;
4214
4215        context = CONTEXT_NONE;
4216
4217        symbolTable = new Object[SYMBOL_TABLE_LENGTH];
4218    }
4219
4220    /**
4221     * Clean up after the parse to allow some garbage collection.
4222     * Leave around anything that might be useful for queries.
4223     */
4224    void cleanupVariables() {
4225        //errorCount = -1;
4226        line = -1;
4227        column = -1;
4228        dataBuffer = null;
4229        nameBuffer = null;
4230        currentElement = null;
4231        currentElementContent = CONTENT_UNDECLARED;
4232        sourceType = INPUT_NONE;
4233        inputStack = null;
4234        externalEntity = null;
4235        entityStack = null;
4236    }
4237
4238    //
4239    // The current XML handler interface.
4240    //
4241    XmlHandler handler;
4242
4243    //
4244    // I/O information.
4245    //
4246    private Reader reader; // current reader
4247
4248    private InputStream is; // current input stream
4249
4250    private int line; // current line number
4251
4252    private int column; // current column number
4253
4254    private int sourceType; // type of input source
4255
4256    private Stack inputStack; // stack of input sources
4257
4258    private URLConnection externalEntity; // current external entity
4259
4260    private int encoding; // current character encoding.
4261
4262    private int currentByteCount; // how many bytes read from current source.
4263
4264    //
4265    // Maintain a count of errors.
4266    //
4267    //private int errorCount;
4268
4269    //
4270    // Buffers for decoded but unparsed character input.
4271    //
4272    private final static int READ_BUFFER_MAX = 16384;
4273
4274    private char[] readBuffer;
4275
4276    private int readBufferPos;
4277
4278    private int readBufferLength;
4279
4280    private int readBufferOverflow; // overflow character from last data chunk.
4281
4282    //
4283    // Stack of entity names, to help detect recursion.
4284    //
4285    private Stack entityStack;
4286
4287    //
4288    // Buffer for undecoded raw byte input.
4289    //
4290    private byte[] rawReadBuffer;
4291
4292    //
4293    // Buffer for parsed character data.
4294    //
4295    private static int DATA_BUFFER_INITIAL = 4096;
4296
4297    private char[] dataBuffer;
4298
4299    private int dataBufferPos;
4300
4301    //
4302    // Buffer for parsed names.
4303    //
4304    private static int NAME_BUFFER_INITIAL = 1024;
4305
4306    private char[] nameBuffer;
4307
4308    private int nameBufferPos;
4309
4310    //
4311    // Hashtables for DTD information on elements, entities, and notations.
4312    //
4313    private Hashtable elementInfo;
4314
4315    private Hashtable entityInfo;
4316
4317    private Hashtable notationInfo;
4318
4319    //
4320    // Element type currently in force.
4321    //
4322    private String currentElement;
4323
4324    private int currentElementContent;
4325
4326    //
4327    // Base external identifiers for resolution.
4328    //
4329    private String basePublicId;
4330
4331    private String baseURI;
4332
4333    private Reader baseReader;
4334
4335    private InputStream baseInputStream;
4336
4337    //
4338    // Are we in a context where PEs are allowed?
4339    //
4340    private int context;
4341
4342    //
4343    // Symbol table, for internalising names.
4344    //
4345    private Object[] symbolTable;
4346
4347    private final static int SYMBOL_TABLE_LENGTH = 1087;
4348
4349    //
4350    // Hash table of attributes found in current start tag.
4351    //
4352    private String[] tagAttributes;
4353
4354    private int tagAttributePos;
4355
4356    //
4357    // Utility flag: have we noticed a CR while reading the last
4358    // data chunk?  If so, we will have to go back and normalise
4359    // CR/LF.
4360    //
4361    private boolean sawCR;
4362}