View Javadoc
1   /* Woodstox XML processor
2    *
3    * Copyright (c) 2004- Tatu Saloranta, tatu.saloranta@iki.fi
4    *
5    * Licensed under the License specified in file LICENSE, included with
6    * the source code.
7    * You may not use this file except in compliance with the License.
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  
16  package com.ctc.wstx.sr;
17  
18  import java.io.FileNotFoundException;
19  import java.io.IOException;
20  import java.net.URL;
21  import java.text.MessageFormat;
22  import java.util.Collections;
23  import java.util.HashMap;
24  import java.util.Map;
25  
26  import javax.xml.stream.Location;
27  import javax.xml.stream.XMLInputFactory;
28  import javax.xml.stream.XMLReporter;
29  import javax.xml.stream.XMLResolver;
30  import javax.xml.stream.XMLStreamException;
31  
32  import org.codehaus.stax2.XMLReporter2;
33  import org.codehaus.stax2.XMLStreamLocation2;
34  import org.codehaus.stax2.validation.XMLValidationProblem;
35  
36  import com.ctc.wstx.api.ReaderConfig;
37  import com.ctc.wstx.cfg.ErrorConsts;
38  import com.ctc.wstx.cfg.InputConfigFlags;
39  import com.ctc.wstx.cfg.ParsingErrorMsgs;
40  import com.ctc.wstx.cfg.XmlConsts;
41  import com.ctc.wstx.dtd.MinimalDTDReader;
42  import com.ctc.wstx.ent.EntityDecl;
43  import com.ctc.wstx.ent.IntEntity;
44  import com.ctc.wstx.exc.*;
45  import com.ctc.wstx.io.DefaultInputResolver;
46  import com.ctc.wstx.io.WstxInputData;
47  import com.ctc.wstx.io.WstxInputLocation;
48  import com.ctc.wstx.io.WstxInputSource;
49  import com.ctc.wstx.util.ExceptionUtil;
50  import com.ctc.wstx.util.SymbolTable;
51  import com.ctc.wstx.util.TextBuffer;
52  
53  /**
54   * Abstract base class that defines some basic functionality that all
55   * Woodstox reader classes (main XML reader, DTD reader) extend from.
56   */
57  public abstract class StreamScanner
58          extends WstxInputData
59          implements InputProblemReporter,
60          InputConfigFlags, ParsingErrorMsgs
61  {
62  
63      // // // Some well-known chars:
64  
65      /**
66       * Last (highest) char code of the three, LF, CR and NULL
67       */
68      public final static char CHAR_CR_LF_OR_NULL = (char) 13;
69  
70      public final static int INT_CR_LF_OR_NULL = 13;
71  
72      /**
73       * Character that allows quick check of whether a char can potentially
74       * be some kind of markup, WRT input stream processing;
75       * has to contain linefeeds, &, < and > (">" only matters when
76       * quoting text, as part of "]]>")
77       */
78      protected final static char CHAR_FIRST_PURE_TEXT = (char) ('>' + 1);
79  
80  
81      /**
82       * First character in Unicode (ie one with lowest id) that is legal
83       * as part of a local name (all valid name chars minus ':'). Used
84       * for doing quick check for local name end; usually name ends in
85       * a whitespace or equals sign.
86       */
87      protected final static char CHAR_LOWEST_LEGAL_LOCALNAME_CHAR = '-';
88  
89      /*
90      ///////////////////////////////////////////////////////////////////////
91      // Character validity constants, structs
92      ///////////////////////////////////////////////////////////////////////
93       */
94  
95      /**
96       * We will only use validity array for first 256 characters, mostly
97       * because after those characters it's easier to do fairly simple
98       * block checks.
99       */
100     private final static int VALID_CHAR_COUNT = 0x100;
101 
102     private final static byte NAME_CHAR_INVALID_B = (byte) 0;
103     private final static byte NAME_CHAR_ALL_VALID_B = (byte) 1;
104     private final static byte NAME_CHAR_VALID_NONFIRST_B = (byte) -1;
105 
106     private final static byte[] sCharValidity = new byte[VALID_CHAR_COUNT];
107 
108     static {
109         // First, since all valid-as-first chars are also valid-as-other chars,
110         // we'll initialize common chars:
111         sCharValidity['_'] = NAME_CHAR_ALL_VALID_B;
112         for (int i = 0, last = ('z' - 'a'); i <= last; ++i) {
113             sCharValidity['A' + i] = NAME_CHAR_ALL_VALID_B;
114             sCharValidity['a' + i] = NAME_CHAR_ALL_VALID_B;
115         }
116         for (int i = 0xC0; i < 0xF6; ++i) { // not all are fully valid, but
117             sCharValidity[i] = NAME_CHAR_ALL_VALID_B;
118         }
119         // ... now we can 'revert' ones not fully valid:
120         sCharValidity[0xD7] = NAME_CHAR_INVALID_B;
121         sCharValidity[0xF7] = NAME_CHAR_INVALID_B;
122 
123         // And then we can proceed with ones only valid-as-other.
124         sCharValidity['-'] = NAME_CHAR_VALID_NONFIRST_B;
125         sCharValidity['.'] = NAME_CHAR_VALID_NONFIRST_B;
126         sCharValidity[0xB7] = NAME_CHAR_VALID_NONFIRST_B;
127         for (int i = '0'; i <= '9'; ++i) {
128             sCharValidity[i] = NAME_CHAR_VALID_NONFIRST_B;
129         }
130     }
131 
132     /**
133      * Public identifiers only use 7-bit ascii range.
134      */
135     private final static int VALID_PUBID_CHAR_COUNT = 0x80;
136     private final static byte[] sPubidValidity = new byte[VALID_PUBID_CHAR_COUNT];
137     //    private final static byte PUBID_CHAR_INVALID_B = (byte) 0;
138     private final static byte PUBID_CHAR_VALID_B = (byte) 1;
139     static {
140         for (int i = 0, last = ('z' - 'a'); i <= last; ++i) {
141             sPubidValidity['A' + i] = PUBID_CHAR_VALID_B;
142             sPubidValidity['a' + i] = PUBID_CHAR_VALID_B;
143         }
144         for (int i = '0'; i <= '9'; ++i) {
145             sPubidValidity[i] = PUBID_CHAR_VALID_B;
146         }
147 
148         // 3 main white space types are valid
149         sPubidValidity[0x0A] = PUBID_CHAR_VALID_B;
150         sPubidValidity[0x0D] = PUBID_CHAR_VALID_B;
151         sPubidValidity[0x20] = PUBID_CHAR_VALID_B;
152 
153         // And many of punctuation/separator ascii chars too:
154         sPubidValidity['-'] = PUBID_CHAR_VALID_B;
155         sPubidValidity['\''] = PUBID_CHAR_VALID_B;
156         sPubidValidity['('] = PUBID_CHAR_VALID_B;
157         sPubidValidity[')'] = PUBID_CHAR_VALID_B;
158         sPubidValidity['+'] = PUBID_CHAR_VALID_B;
159         sPubidValidity[','] = PUBID_CHAR_VALID_B;
160         sPubidValidity['.'] = PUBID_CHAR_VALID_B;
161         sPubidValidity['/'] = PUBID_CHAR_VALID_B;
162         sPubidValidity[':'] = PUBID_CHAR_VALID_B;
163         sPubidValidity['='] = PUBID_CHAR_VALID_B;
164         sPubidValidity['?'] = PUBID_CHAR_VALID_B;
165         sPubidValidity[';'] = PUBID_CHAR_VALID_B;
166         sPubidValidity['!'] = PUBID_CHAR_VALID_B;
167         sPubidValidity['*'] = PUBID_CHAR_VALID_B;
168         sPubidValidity['#'] = PUBID_CHAR_VALID_B;
169         sPubidValidity['@'] = PUBID_CHAR_VALID_B;
170         sPubidValidity['$'] = PUBID_CHAR_VALID_B;
171         sPubidValidity['_'] = PUBID_CHAR_VALID_B;
172         sPubidValidity['%'] = PUBID_CHAR_VALID_B;
173     }
174 
175     /*
176     ///////////////////////////////////////////////////////////////////////
177     // Basic configuration
178     ///////////////////////////////////////////////////////////////////////
179      */
180 
181     /**
182      * Copy of the configuration object passed by the factory.
183      * Contains immutable settings for this reader (or in case
184      * of DTD parsers, reader that uses it)
185      */
186     protected final ReaderConfig mConfig;
187 
188     // // // Various extracted settings:
189 
190     /**
191      * If true, Reader is namespace aware, and should do basic checks
192      * (usually enforcing limitations on having colons in names)
193      */
194     protected final boolean mCfgNsEnabled;
195 
196     // Extracted standard on/off settings:
197 
198     /**
199      * note: left non-final on purpose: sub-class may need to modify
200      * the default value after construction.
201      */
202     protected boolean mCfgReplaceEntities;
203 
204     /*
205     ///////////////////////////////////////////////////////////////////////
206     // Symbol handling, if applicable
207     ///////////////////////////////////////////////////////////////////////
208      */
209 
210     final SymbolTable mSymbols;
211 
212     /**
213      * Local full name for the event, if it has one (note: element events
214      * do NOT use this variable; those names are stored in element stack):
215      * target for processing instructions.
216      *<p>
217      * Currently used for proc. instr. target, and entity name (at least
218      * when current entity reference is null).
219      *<p>
220      * Note: this variable is generally not cleared, since it comes from
221      * a symbol table, ie. this won't be the only reference.
222      */
223     protected String mCurrName;
224 
225     /*
226     ///////////////////////////////////////////////////////////////////////
227     // Input handling
228     ///////////////////////////////////////////////////////////////////////
229      */
230 
231     /**
232      * Currently active input source; contains link to parent (nesting) input
233      * sources, if any.
234      */
235     protected WstxInputSource mInput;
236 
237     /**
238      * Top-most input source this reader can use; due to input source
239      * chaining, this is not necessarily the root of all input; for example,
240      * external DTD subset reader's root input still has original document
241      * input as its parent.
242      */
243     protected final WstxInputSource mRootInput;
244 
245     /**
246      * Custom resolver used to handle external entities that are to be expanded
247      * by this reader (external param/general entity expander)
248      */
249     protected XMLResolver mEntityResolver = null;
250 
251     /**
252      * This is the current depth of the input stack (same as what input
253      * element stack would return as its depth).
254      * It is used to enforce input scope constraints for nesting of
255      * elements (for xml reader) and dtd declaration (for dtd reader)
256      * with regards to input block (entity expansion) boundaries.
257      *<p>
258      * Basically this value is compared to {@link #mInputTopDepth}, which
259      * indicates what was the depth at the point where the currently active
260      * input scope/block was started.
261      */
262     protected int mCurrDepth;
263 
264     protected int mInputTopDepth;
265 
266     /**
267      * Number of times a parsed general entity has been expanded; used for
268      * (optionally) limiting number of expansion to guard against
269      * denial-of-service attacks like "Billion Laughs".
270      *
271      * @since 4.3
272      */
273     protected int mEntityExpansionCount;
274 
275     /**
276      * Flag that indicates whether linefeeds in the input data are to
277      * be normalized or not.
278      * Xml specs mandate that the line feeds are only normalized
279      * when they are from the external entities (main doc, external
280      * general/parsed entities), so normalization has to be
281      * suppressed when expanding internal general/parsed entities.
282      */
283     protected boolean mNormalizeLFs;
284 
285     /**
286      * Flag that indicates whether all escaped chars are accepted in XML 1.0.
287      */
288     protected boolean mXml10AllowAllEscapedChars;
289 
290     /*
291     ///////////////////////////////////////////////////////////////////////
292     // Buffer(s) for local name(s) and text content
293     ///////////////////////////////////////////////////////////////////////
294      */
295 
296     /**
297      * Temporary buffer used if local name can not be just directly
298      * constructed from input buffer (name is on a boundary or such).
299      */
300     protected char[] mNameBuffer = null;
301 
302     /*
303     ///////////////////////////////////////////////////////////////////////
304     // Information about starting location of event
305     // Reader is pointing to; updated on-demand
306     ///////////////////////////////////////////////////////////////////////
307      */
308 
309     // // // Location info at point when current token was started
310 
311     /**
312      * Total number of characters read before start of current token.
313      * Since big (gigabyte-sized) inputs are possible, this needs to be a long,
314      * unlike pointers and sizes related to in-memory buffers.
315      */
316     protected long mTokenInputTotal = 0;
317 
318     /**
319      * Input row on which current token starts, 1-based
320      */
321     protected int mTokenInputRow = 1;
322 
323     /**
324      * Column on input row that current token starts; 0-based (although
325      * in the end it'll be converted to 1-based)
326      */
327     protected int mTokenInputCol = 0;
328 
329     /*
330     ///////////////////////////////////////////////////////////////////////
331     // XML document information (from doc decl if one was found) common to
332     // all entities (main xml document, external DTD subset)
333     ///////////////////////////////////////////////////////////////////////
334      */
335 
336     /**
337      * Input stream encoding, if known (passed in, or determined by
338      * auto-detection); null if not.
339      */
340     protected String mDocInputEncoding = null;
341 
342     /**
343      * Character encoding from xml declaration, if any; null if no
344      * declaration, or it didn't specify encoding.
345      */
346     protected String mDocXmlEncoding = null;
347 
348     /**
349      * XML version as declared by the document; one of constants
350      * from {@link XmlConsts} (like {@link XmlConsts#XML_V_10}).
351      */
352     protected int mDocXmlVersion = XmlConsts.XML_V_UNKNOWN;
353 
354     /**
355      * Cache of internal character entities;
356      */
357     protected Map<String,IntEntity> mCachedEntities;
358 
359     /**
360      * Flag for whether or not character references should be treated as entities
361      */
362     protected boolean mCfgTreatCharRefsAsEntities;
363 
364     /**
365      * Entity reference stream currently points to.
366      */
367     protected EntityDecl mCurrEntity;
368 
369     /*
370     ///////////////////////////////////////////////////////////////////////
371     // Life-cycle
372     ///////////////////////////////////////////////////////////////////////
373      */
374 
375     /**
376      * Constructor used when creating a complete new (main-level) reader that
377      * does not share its input buffers or state with another reader.
378      */
379     protected StreamScanner(WstxInputSource input, ReaderConfig cfg,
380                             XMLResolver res)
381     {
382         super();
383         mInput = input;
384         // 17-Jun-2004, TSa: Need to know root-level input source
385         mRootInput = input;
386 
387         mConfig = cfg;
388         mSymbols = cfg.getSymbols();
389         int cf = cfg.getConfigFlags();
390         mCfgNsEnabled = (cf & CFG_NAMESPACE_AWARE) != 0;
391         mCfgReplaceEntities = (cf & CFG_REPLACE_ENTITY_REFS) != 0;
392 
393         // waiting for pull request, see https://github.com/FasterXML/woodstox/pull/56
394         mXml10AllowAllEscapedChars = true;//mConfig.willXml10AllowAllEscapedChars();
395 
396         mNormalizeLFs = mConfig.willNormalizeLFs();
397         mInputBuffer = null;
398         mInputPtr = mInputEnd = 0;
399         mEntityResolver = res;
400 
401         mCfgTreatCharRefsAsEntities = mConfig.willTreatCharRefsAsEnts();
402         if (mCfgTreatCharRefsAsEntities) {
403             mCachedEntities = new HashMap<String,IntEntity>();
404         } else {
405             mCachedEntities = Collections.emptyMap();
406         }
407     }
408 
409     /*
410     ///////////////////////////////////////////////////////////////////////
411     // Package API
412     ///////////////////////////////////////////////////////////////////////
413      */
414 
415     /**
416      * Method that returns location of the last character returned by this
417      * reader; that is, location "one less" than the currently pointed to
418      * location.
419      */
420     protected WstxInputLocation getLastCharLocation()
421     {
422         return mInput.getLocation(mCurrInputProcessed + mInputPtr - 1,
423                 mCurrInputRow, mInputPtr - mCurrInputRowStart);
424     }
425 
426     protected URL getSource() throws IOException {
427         return mInput.getSource();
428     }
429 
430     protected String getSystemId() {
431         return mInput.getSystemId();
432     }
433 
434     /*
435     ///////////////////////////////////////////////////////////////////////
436     // Partial `LocationInfo` implementation (not implemented
437     // by this base class, but is by some sub-classes)
438     ///////////////////////////////////////////////////////////////////////
439      */
440 
441     /**
442      * Returns location of last properly parsed token; as per StAX specs,
443      * apparently needs to be the end of current event, which is the same
444      * as the start of the following event (or EOF if that's next).
445      */
446     @Override
447     public abstract Location getLocation();
448 
449     public XMLStreamLocation2 getStartLocation()
450     {
451         // note: +1 is used as columns are 1-based...
452         return mInput.getLocation(mTokenInputTotal,
453                 mTokenInputRow, mTokenInputCol + 1);
454     }
455 
456     public XMLStreamLocation2 getCurrentLocation()
457     {
458         return mInput.getLocation(mCurrInputProcessed + mInputPtr,
459                 mCurrInputRow, mInputPtr - mCurrInputRowStart + 1);
460     }
461 
462     /*
463     ///////////////////////////////////////////////////////////////////////
464     // InputProblemReporter implementation
465     ///////////////////////////////////////////////////////////////////////
466      */
467 
468     public WstxException throwWfcException(String msg, boolean deferErrors)
469             throws WstxException
470     {
471         WstxException ex = constructWfcException(msg);
472         if (!deferErrors) {
473             throw ex;
474         }
475         return ex;
476     }
477 
478     @Override
479     public void throwParseError(String msg) throws XMLStreamException {
480         throwParseError(msg, null, null);
481     }
482 
483     /**
484      * Throws generic parse error with specified message and current parsing
485      * location.
486      *<p>
487      * Note: public access only because core code in other packages needs
488      * to access it.
489      */
490     @Override
491     public void throwParseError(String format, Object arg, Object arg2)
492             throws XMLStreamException
493     {
494         String msg = (arg != null || arg2 != null) ?
495                 MessageFormat.format(format, new Object[] { arg, arg2 }) : format;
496         throw constructWfcException(msg);
497     }
498 
499     public void reportProblem(String probType, String format, Object arg, Object arg2)
500             throws XMLStreamException
501     {
502         XMLReporter rep = mConfig.getXMLReporter();
503         if (rep != null) {
504             _reportProblem(rep, probType,
505                     MessageFormat.format(format, new Object[] { arg, arg2 }), null);
506         }
507     }
508 
509     @Override
510     public void reportProblem(Location loc, String probType,
511                               String format, Object arg, Object arg2)
512             throws XMLStreamException
513     {
514         XMLReporter rep = mConfig.getXMLReporter();
515         if (rep != null) {
516             String msg = (arg != null || arg2 != null) ?
517                     MessageFormat.format(format, new Object[] { arg, arg2 }) : format;
518             _reportProblem(rep, probType, msg, loc);
519         }
520     }
521 
522     protected void _reportProblem(XMLReporter rep, String probType, String msg, Location loc)
523             throws XMLStreamException
524     {
525         if (loc == null) {
526             loc = getLastCharLocation();
527         }
528         _reportProblem(rep, new XMLValidationProblem(loc, msg, XMLValidationProblem.SEVERITY_ERROR, probType));
529     }
530 
531     protected void _reportProblem(XMLReporter rep, XMLValidationProblem prob)
532             throws XMLStreamException
533     {
534         if (rep != null) {
535             Location loc = prob.getLocation();
536             if (loc == null) {
537                 loc = getLastCharLocation();
538                 prob.setLocation(loc);
539             }
540             // Backwards-compatibility fix: add non-null type, if missing:
541             if (prob.getType() == null) {
542                 prob.setType(ErrorConsts.WT_VALIDATION);
543             }
544             // [WSTX-154]: was catching and dropping thrown exception: shouldn't.
545             // [WTSX-157]: need to support XMLReporter2
546             if (rep instanceof XMLReporter2) {
547                 ((XMLReporter2) rep).report(prob);
548             } else {
549                 rep.report(prob.getMessage(), prob.getType(), prob, loc);
550             }
551         }
552     }
553 
554     /**
555      *<p>
556      * Note: this is the base implementation used for implementing
557      * <code>ValidationContext</code>
558      */
559     @Override
560     public void reportValidationProblem(XMLValidationProblem prob)
561             throws XMLStreamException
562     {
563         // !!! TBI: Fail-fast vs. deferred modes?
564         /* For now let's implement basic functionality: warnings get
565          * reported via XMLReporter, errors and fatal errors result in
566          * immediate exceptions.
567          */
568         /* 27-May-2008, TSa: [WSTX-153] Above is incorrect: as per Stax
569          *   javadocs for XMLReporter, both warnings and non-fatal errors
570          *   (which includes all validation errors) should be reported via
571          *   XMLReporter interface, and only fatals should cause an
572          *   immediate stream exception (by-passing reporter)
573          */
574         if (prob.getSeverity() > XMLValidationProblem.SEVERITY_ERROR) {
575             throw WstxValidationException.create(prob);
576         }
577         XMLReporter rep = mConfig.getXMLReporter();
578         if (rep != null) {
579             _reportProblem(rep, prob);
580         } else {
581             /* If no reporter, regular non-fatal errors are to be reported
582              * as exceptions as well, for backwards compatibility
583              */
584             if (prob.getSeverity() >= XMLValidationProblem.SEVERITY_ERROR) {
585                 throw WstxValidationException.create(prob);
586             }
587         }
588     }
589 
590     public void reportValidationProblem(String msg, int severity)
591             throws XMLStreamException
592     {
593         reportValidationProblem(new XMLValidationProblem(getLastCharLocation(),
594                 msg, severity));
595     }
596 
597     @Override
598     public void reportValidationProblem(String msg)
599             throws XMLStreamException
600     {
601         reportValidationProblem(new XMLValidationProblem(getLastCharLocation(), msg,
602                 XMLValidationProblem.SEVERITY_ERROR));
603     }
604 
605     public void reportValidationProblem(Location loc, String msg)
606             throws XMLStreamException
607     {
608         reportValidationProblem(new XMLValidationProblem(loc, msg));
609     }
610 
611     @Override
612     public void reportValidationProblem(String format, Object arg, Object arg2)
613             throws XMLStreamException
614     {
615         reportValidationProblem(MessageFormat.format(format, new Object[] { arg, arg2 }));
616     }
617 
618     /*
619     ///////////////////////////////////////////////////////////////////////
620     // Other error reporting methods
621     ///////////////////////////////////////////////////////////////////////
622      */
623 
624     protected WstxException constructWfcException(String msg)
625     {
626         return new WstxParsingException(msg, getLastCharLocation());
627     }
628 
629     /**
630      * Construct and return a {@link XMLStreamException} to throw
631      * as a result of a failed Typed Access operation (but one not
632      * caused by a Well-Formedness Constraint or Validation Constraint
633      * problem)
634      */
635     /*
636     protected WstxException _constructTypeException(String msg)
637     {
638         // Hmmh. Should there be a distinct sub-type?
639         return new WstxParsingException(msg, getLastCharLocation());
640     }
641     */
642 
643     protected WstxException constructFromIOE(IOException ioe)
644     {
645         return new WstxIOException(ioe);
646     }
647 
648     protected WstxException constructNullCharException()
649     {
650         return new WstxUnexpectedCharException("Illegal character (NULL, unicode 0) encountered: not valid in any content",
651                 getLastCharLocation(), CHAR_NULL);
652     }
653 
654     protected void throwUnexpectedChar(int i, String msg) throws WstxException
655     {
656         char c = (char) i;
657         String excMsg = "Unexpected character "+getCharDesc(c)+msg;
658         throw new WstxUnexpectedCharException(excMsg, getLastCharLocation(), c);
659     }
660 
661     protected void throwNullChar() throws WstxException {
662         throw constructNullCharException();
663     }
664 
665     protected void throwInvalidSpace(int i) throws WstxException {
666         throwInvalidSpace(i, false);
667     }
668 
669     protected WstxException throwInvalidSpace(int i, boolean deferErrors)
670             throws WstxException
671     {
672         char c = (char) i;
673         WstxException ex;
674         if (c == CHAR_NULL) {
675             ex = constructNullCharException();
676         } else {
677             String msg = "Illegal character ("+getCharDesc(c)+")";
678             if (mXml11) {
679                 msg += " [note: in XML 1.1, it could be included via entity expansion]";
680             }
681             ex = new WstxUnexpectedCharException(msg, getLastCharLocation(), c);
682         }
683         if (!deferErrors) {
684             throw ex;
685         }
686         return ex;
687     }
688 
689     protected void throwUnexpectedEOF(String msg)
690             throws WstxException
691     {
692         throw new WstxEOFException("Unexpected EOF"+(msg == null ? "" : msg),
693                 getLastCharLocation());
694     }
695 
696     /**
697      * Similar to {@link #throwUnexpectedEOF}, but only indicates ending
698      * of an input block. Used when reading a token that can not span
699      * input block boundaries (ie. can not continue past end of an
700      * entity expansion).
701      */
702     protected void throwUnexpectedEOB(String msg)
703             throws WstxException
704     {
705         throw new WstxEOFException("Unexpected end of input block"+(msg == null ? "" : msg),
706                 getLastCharLocation());
707     }
708 
709     protected void throwFromIOE(IOException ioe) throws WstxException {
710         throw new WstxIOException(ioe);
711     }
712 
713     protected void throwFromStrE(XMLStreamException strex)
714             throws WstxException
715     {
716         if (strex instanceof WstxException) {
717             throw (WstxException) strex;
718         }
719         throw new WstxException(strex);
720     }
721 
722     /**
723      * Method called to report an error, when caller's signature only
724      * allows runtime exceptions to be thrown.
725      */
726     protected void throwLazyError(Exception e)
727     {
728         if (e instanceof XMLStreamException) {
729             WstxLazyException.throwLazily((XMLStreamException) e);
730         }
731         ExceptionUtil.throwRuntimeException(e);
732     }
733 
734     protected String tokenTypeDesc(int type) {
735         return ErrorConsts.tokenTypeDesc(type);
736     }
737 
738     /*
739     ///////////////////////////////////////////////////////////////////////
740     // Input buffer handling
741     ///////////////////////////////////////////////////////////////////////
742      */
743 
744     /**
745      * Returns current input source this source uses.
746      *<p>
747      * Note: public only because some implementations are on different
748      * package.
749      */
750     public final WstxInputSource getCurrentInput() {
751         return mInput;
752     }
753 
754     protected final int inputInBuffer() {
755         return mInputEnd - mInputPtr;
756     }
757 
758     @SuppressWarnings("cast")
759     protected final int getNext() throws XMLStreamException
760     {
761         if (mInputPtr >= mInputEnd) {
762             if (!loadMore()) {
763                 return -1;
764             }
765         }
766         return (int) mInputBuffer[mInputPtr++];
767     }
768 
769     /**
770      * Similar to {@link #getNext}, but does not advance pointer
771      * in input buffer.
772      *<p>
773      * Note: this method only peeks within current input source;
774      * it does not close it and check nested input source (if any).
775      * This is necessary when checking keywords, since they can never
776      * cross input block boundary.
777      */
778     @SuppressWarnings("cast")
779     protected final int peekNext()
780             throws XMLStreamException
781     {
782         if (mInputPtr >= mInputEnd) {
783             if (!loadMoreFromCurrent()) {
784                 return -1;
785             }
786         }
787         return (int) mInputBuffer[mInputPtr];
788     }
789 
790     protected final char getNextChar(String errorMsg)
791             throws XMLStreamException
792     {
793         if (mInputPtr >= mInputEnd) {
794             loadMore(errorMsg);
795         }
796         return mInputBuffer[mInputPtr++];
797     }
798 
799     /**
800      * Similar to {@link #getNextChar}, but will not read more characters
801      * from parent input source(s) if the current input source doesn't
802      * have more content. This is often needed to prevent "runaway" content,
803      * such as comments that start in an entity but do not have matching
804      * close marker inside entity; XML specification specifically states
805      * such markup is not legal.
806      */
807     protected final char getNextCharFromCurrent(String errorMsg)
808             throws XMLStreamException
809     {
810         if (mInputPtr >= mInputEnd) {
811             loadMoreFromCurrent(errorMsg);
812         }
813         return mInputBuffer[mInputPtr++];
814     }
815 
816     /**
817      * Method that will skip through zero or more white space characters,
818      * and return either the character following white space, or -1 to
819      * indicate EOF (end of the outermost input source)/
820      */
821     @SuppressWarnings("cast")
822     protected final int getNextAfterWS()
823             throws XMLStreamException
824     {
825         if (mInputPtr >= mInputEnd) {
826             if (!loadMore()) {
827                 return -1;
828             }
829         }
830         char c = mInputBuffer[mInputPtr++];
831         while (c <= CHAR_SPACE) {
832             // Linefeed?
833             if (c == '\n' || c == '\r') {
834                 skipCRLF(c);
835             } else if (c != CHAR_SPACE && c != '\t') {
836                 throwInvalidSpace(c);
837             }
838             // Still a white space?
839             if (mInputPtr >= mInputEnd) {
840                 if (!loadMore()) {
841                     return -1;
842                 }
843             }
844             c = mInputBuffer[mInputPtr++];
845         }
846         return (int) c;
847     }
848 
849     protected final char getNextCharAfterWS(String errorMsg)
850             throws XMLStreamException
851     {
852         if (mInputPtr >= mInputEnd) {
853             loadMore(errorMsg);
854         }
855 
856         char c = mInputBuffer[mInputPtr++];
857         while (c <= CHAR_SPACE) {
858             // Linefeed?
859             if (c == '\n' || c == '\r') {
860                 skipCRLF(c);
861             } else if (c != CHAR_SPACE && c != '\t') {
862                 throwInvalidSpace(c);
863             }
864 
865             // Still a white space?
866             if (mInputPtr >= mInputEnd) {
867                 loadMore(errorMsg);
868             }
869             c = mInputBuffer[mInputPtr++];
870         }
871         return c;
872     }
873 
874     protected final char getNextInCurrAfterWS(String errorMsg)
875             throws XMLStreamException
876     {
877         return getNextInCurrAfterWS(errorMsg, getNextCharFromCurrent(errorMsg));
878     }
879 
880     protected final char getNextInCurrAfterWS(String errorMsg, char c)
881             throws XMLStreamException
882     {
883         while (c <= CHAR_SPACE) {
884             // Linefeed?
885             if (c == '\n' || c == '\r') {
886                 skipCRLF(c);
887             } else if (c != CHAR_SPACE && c != '\t') {
888                 throwInvalidSpace(c);
889             }
890 
891             // Still a white space?
892             if (mInputPtr >= mInputEnd) {
893                 loadMoreFromCurrent(errorMsg);
894             }
895             c = mInputBuffer[mInputPtr++];
896         }
897         return c;
898     }
899 
900     /**
901      * Method called when a CR has been spotted in input; checks if next
902      * char is LF, and if so, skips it. Note that next character has to
903      * come from the current input source, to qualify; it can never come
904      * from another (nested) input source.
905      *
906      * @return True, if passed in char is '\r' and next one is '\n'.
907      */
908     protected final boolean skipCRLF(char c)
909             throws XMLStreamException
910     {
911         boolean result;
912 
913         if (c == '\r' && peekNext() == '\n') {
914             ++mInputPtr;
915             result = true;
916         } else {
917             result = false;
918         }
919         ++mCurrInputRow;
920         mCurrInputRowStart = mInputPtr;
921         return result;
922     }
923 
924     protected final void markLF() {
925         ++mCurrInputRow;
926         mCurrInputRowStart = mInputPtr;
927     }
928 
929     protected final void markLF(int inputPtr) {
930         ++mCurrInputRow;
931         mCurrInputRowStart = inputPtr;
932     }
933 
934     /**
935      * Method to push back last character read; can only be called once,
936      * that is, no more than one char can be guaranteed to be succesfully
937      * returned.
938      */
939     protected final void pushback() { --mInputPtr; }
940 
941     /*
942     ///////////////////////////////////////////////////////////////////////
943     // Sub-class overridable input handling methods
944     ///////////////////////////////////////////////////////////////////////
945      */
946 
947     /**
948      * Method called when an entity has been expanded (new input source
949      * has been created). Needs to initialize location information and change
950      * active input source.
951      *
952      * @param entityId Name of the entity being expanded
953      */
954     protected void initInputSource(WstxInputSource newInput, boolean isExt,
955                                    String entityId)
956             throws XMLStreamException
957     {
958         // Let's make sure new input will be read next time input is needed:
959         mInputPtr = 0;
960         mInputEnd = 0;
961         /* Plus, reset the input location so that'll be accurate for
962          * error reporting etc.
963          */
964         mInputTopDepth = mCurrDepth;
965 
966         // [WSTX-296]: Check for entity expansion depth against configurable limit
967         int entityDepth = mInput.getEntityDepth() + 1;
968         verifyLimit("Maximum entity expansion depth", mConfig.getMaxEntityDepth(), entityDepth);
969         mInput = newInput;
970         mInput.initInputLocation(this, mCurrDepth, entityDepth);
971 
972         /* 21-Feb-2006, TSa: Linefeeds are NOT normalized when expanding
973          *   internal entities (XML, 2.11)
974          */
975         if (isExt) {
976             mNormalizeLFs = true;
977         } else {
978             mNormalizeLFs = false;
979         }
980     }
981 
982     /**
983      * Method that will try to read one or more characters from currently
984      * open input sources; closing input sources if necessary.
985      *
986      * @return true if reading succeeded (or may succeed), false if
987      *   we reached EOF.
988      */
989     protected boolean loadMore()
990             throws XMLStreamException
991     {
992         WstxInputSource input = mInput;
993         do {
994             /* Need to make sure offsets are properly updated for error
995              * reporting purposes, and do this now while previous amounts
996              * are still known.
997              */
998             mCurrInputProcessed += mInputEnd;
999             verifyLimit("Maximum document characters", mConfig.getMaxCharacters(), mCurrInputProcessed);
1000             mCurrInputRowStart -= mInputEnd;
1001             int count;
1002             try {
1003                 count = input.readInto(this);
1004                 if (count > 0) {
1005                     return true;
1006                 }
1007                 input.close();
1008             } catch (IOException ioe) {
1009                 throw constructFromIOE(ioe);
1010             }
1011             if (input == mRootInput) {
1012                 /* Note: no need to check entity/input nesting in this
1013                  * particular case, since it will be handled by higher level
1014                  * parsing code (results in an unexpected EOF)
1015                  */
1016                 return false;
1017             }
1018             WstxInputSource parent = input.getParent();
1019             if (parent == null) { // sanity check!
1020                 throwNullParent(input);
1021             }
1022             /* 13-Feb-2006, TSa: Ok, do we violate a proper nesting constraints
1023              *   with this input block closure?
1024              */
1025             if (mCurrDepth != input.getScopeId()) {
1026                 handleIncompleteEntityProblem(input);
1027             }
1028 
1029             mInput = input = parent;
1030             input.restoreContext(this);
1031             mInputTopDepth = input.getScopeId();
1032             /* 21-Feb-2006, TSa: Since linefeed normalization needs to be
1033              *   suppressed for internal entity expansion, we may need to
1034              *   change the state...
1035              */
1036             if (!mNormalizeLFs) {
1037                 mNormalizeLFs = !input.fromInternalEntity();
1038             }
1039             // Maybe there are leftovers from that input in buffer now?
1040         } while (mInputPtr >= mInputEnd);
1041 
1042         return true;
1043     }
1044 
1045     protected final boolean loadMore(String errorMsg)
1046             throws XMLStreamException
1047     {
1048         if (!loadMore()) {
1049             throwUnexpectedEOF(errorMsg);
1050         }
1051         return true;
1052     }
1053 
1054     protected boolean loadMoreFromCurrent()
1055             throws XMLStreamException
1056     {
1057         // Need to update offsets properly
1058         mCurrInputProcessed += mInputEnd;
1059         mCurrInputRowStart -= mInputEnd;
1060         verifyLimit("Maximum document characters", mConfig.getMaxCharacters(), mCurrInputProcessed);
1061         try {
1062             int count = mInput.readInto(this);
1063             return (count > 0);
1064         } catch (IOException ie) {
1065             throw constructFromIOE(ie);
1066         }
1067     }
1068 
1069     protected final boolean loadMoreFromCurrent(String errorMsg)
1070             throws XMLStreamException
1071     {
1072         if (!loadMoreFromCurrent()) {
1073             throwUnexpectedEOB(errorMsg);
1074         }
1075         return true;
1076     }
1077 
1078     /**
1079      * Method called to make sure current main-level input buffer has at
1080      * least specified number of characters available consequtively,
1081      * without having to call {@link #loadMore}. It can only be called
1082      * when input comes from main-level buffer; further, call can shift
1083      * content in input buffer, so caller has to flush any data still
1084      * pending. In short, caller has to know exactly what it's doing. :-)
1085      *<p>
1086      * Note: method does not check for any other input sources than the
1087      * current one -- if current source can not fulfill the request, a
1088      * failure is indicated.
1089      *
1090      * @return true if there's now enough data; false if not (EOF)
1091      */
1092     protected boolean ensureInput(int minAmount)
1093             throws XMLStreamException
1094     {
1095         int currAmount = mInputEnd - mInputPtr;
1096         if (currAmount >= minAmount) {
1097             return true;
1098         }
1099         try {
1100             return mInput.readMore(this, minAmount);
1101         } catch (IOException ie) {
1102             throw constructFromIOE(ie);
1103         }
1104     }
1105 
1106     protected void closeAllInput(boolean force)
1107             throws XMLStreamException
1108     {
1109         WstxInputSource input = mInput;
1110         while (true) {
1111             try {
1112                 if (force) {
1113                     input.closeCompletely();
1114                 } else {
1115                     input.close();
1116                 }
1117             } catch (IOException ie) {
1118                 throw constructFromIOE(ie);
1119             }
1120             if (input == mRootInput) {
1121                 break;
1122             }
1123             WstxInputSource parent = input.getParent();
1124             if (parent == null) { // sanity check!
1125                 throwNullParent(input);
1126             }
1127             mInput = input = parent;
1128         }
1129     }
1130 
1131     /**
1132      * @param curr Input source currently in use
1133      */
1134     protected void throwNullParent(WstxInputSource curr)
1135     {
1136         throw new IllegalStateException(ErrorConsts.ERR_INTERNAL);
1137         //throw new IllegalStateException("Internal error: null parent for input source '"+curr+"'; should never occur (should have stopped at root input '"+mRootInput+"').");
1138     }
1139 
1140     /*
1141     ///////////////////////////////////////////////////////////////////////
1142     // Entity resolution
1143     ///////////////////////////////////////////////////////////////////////
1144      */
1145 
1146     /**
1147      * Method that tries to resolve a character entity, or (if caller so
1148      * specifies), a pre-defined internal entity (lt, gt, amp, apos, quot).
1149      * It will succeed iff:
1150      * <ol>
1151      *  <li>Entity in question is a simple character entity (either one of
1152      *    5 pre-defined ones, or using decimal/hex notation), AND
1153      *   <li>
1154      *  <li>Entity fits completely inside current input buffer.
1155      *   <li>
1156      * </ol>
1157      * If so, character value of entity is returned. Character 0 is returned
1158      * otherwise; if so, caller needs to do full resolution.
1159      *<p>
1160      * Note: On entry we are guaranteed there are at least 3 more characters
1161      * in this buffer; otherwise we shouldn't be called.
1162      *
1163      * @param checkStd If true, will check pre-defined internal entities
1164      *   (gt, lt, amp, apos, quot); if false, will only check actual
1165      *   character entities.
1166      *
1167      * @return (Valid) character value, if entity is a character reference,
1168      *   and could be resolved from current input buffer (does not span
1169      *   buffer boundary); null char (code 0) if not (either non-char
1170      *   entity, or spans input buffer boundary).
1171      */
1172     protected int resolveSimpleEntity(boolean checkStd)
1173             throws XMLStreamException
1174     {
1175         char[] buf = mInputBuffer;
1176         int ptr = mInputPtr;
1177         char c = buf[ptr++];
1178 
1179         // Numeric reference?
1180         if (c == '#') {
1181             c = buf[ptr++];
1182             int value = 0;
1183             int inputLen = mInputEnd;
1184             if (c == 'x') { // hex
1185                 while (ptr < inputLen) {
1186                     c = buf[ptr++];
1187                     if (c == ';') {
1188                         break;
1189                     }
1190                     value = value << 4;
1191                     if (c <= '9' && c >= '0') {
1192                         value += (c - '0');
1193                     } else if (c >= 'a' && c <= 'f') {
1194                         value += (10 + (c - 'a'));
1195                     } else if (c >= 'A' && c <= 'F') {
1196                         value += (10 + (c - 'A'));
1197                     } else {
1198                         mInputPtr = ptr; // so error points to correct char
1199                         throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
1200                     }
1201                     /* Need to check for overflow; easiest to do right as
1202                      * it happens...
1203                      */
1204                     if (value > MAX_UNICODE_CHAR) {
1205                         reportUnicodeOverflow();
1206                     }
1207                 }
1208             } else { // numeric (decimal)
1209                 while (c != ';') {
1210                     if (c <= '9' && c >= '0') {
1211                         value = (value * 10) + (c - '0');
1212                         // Overflow?
1213                         if (value > MAX_UNICODE_CHAR) {
1214                             reportUnicodeOverflow();
1215                         }
1216                     } else {
1217                         mInputPtr = ptr; // so error points to correct char
1218                         throwUnexpectedChar(c, "; expected a decimal number.");
1219                     }
1220                     if (ptr >= inputLen) {
1221                         break;
1222                     }
1223                     c = buf[ptr++];
1224                 }
1225             }
1226             /* We get here either if we got it all, OR if we ran out of
1227              * input in current buffer.
1228              */
1229             if (c == ';') { // got the full thing
1230                 mInputPtr = ptr;
1231                 validateChar(value);
1232                 return value;
1233             }
1234 
1235             /* If we ran out of input, need to just fall back, gets
1236              * resolved via 'full' resolution mechanism.
1237              */
1238         } else if (checkStd) {
1239             /* Caller may not want to resolve these quite yet...
1240              * (when it wants separate events for non-char entities)
1241              */
1242             if (c == 'a') { // amp or apos?
1243                 c = buf[ptr++];
1244 
1245                 if (c == 'm') { // amp?
1246                     if (buf[ptr++] == 'p') {
1247                         if (ptr < mInputEnd && buf[ptr++] == ';') {
1248                             mInputPtr = ptr;
1249                             return '&';
1250                         }
1251                     }
1252                 } else if (c == 'p') { // apos?
1253                     if (buf[ptr++] == 'o') {
1254                         int len = mInputEnd;
1255                         if (ptr < len && buf[ptr++] == 's') {
1256                             if (ptr < len && buf[ptr++] == ';') {
1257                                 mInputPtr = ptr;
1258                                 return '\'';
1259                             }
1260                         }
1261                     }
1262                 }
1263             } else if (c == 'g') { // gt?
1264                 if (buf[ptr++] == 't' && buf[ptr++] == ';') {
1265                     mInputPtr = ptr;
1266                     return '>';
1267                 }
1268             } else if (c == 'l') { // lt?
1269                 if (buf[ptr++] == 't' && buf[ptr++] == ';') {
1270                     mInputPtr = ptr;
1271                     return '<';
1272                 }
1273             } else if (c == 'q') { // quot?
1274                 if (buf[ptr++] == 'u' && buf[ptr++] == 'o') {
1275                     int len = mInputEnd;
1276                     if (ptr < len && buf[ptr++] == 't') {
1277                         if (ptr < len && buf[ptr++] == ';') {
1278                             mInputPtr = ptr;
1279                             return '"';
1280                         }
1281                     }
1282                 }
1283             }
1284         }
1285         return 0;
1286     }
1287 
1288     /**
1289      * Method called to resolve character entities, and only character
1290      * entities (except that pre-defined char entities -- amp, apos, lt,
1291      * gt, quote -- MAY be "char entities" in this sense, depending on
1292      * arguments).
1293      * Otherwise it is to return the null char; if so,
1294      * the input pointer will point to the same point as when method
1295      * entered (char after ampersand), plus the ampersand itself is
1296      * guaranteed to be in the input buffer (so caller can just push it
1297      * back if necessary).
1298      *<p>
1299      * Most often this method is called when reader is not to expand
1300      * non-char entities automatically, but to return them as separate
1301      * events.
1302      *<p>
1303      * Main complication here is that we need to do 5-char lookahead. This
1304      * is problematic if chars are on input buffer boundary. This is ok
1305      * for the root level input buffer, but not for some nested buffers.
1306      * However, according to XML specs, such split entities are actually
1307      * illegal... so we can throw an exception in those cases.
1308      *
1309      * @param checkStd If true, will check pre-defined internal entities
1310      *   (gt, lt, amp, apos, quot) as character entities; if false, will only
1311      *   check actual 'real' character entities.
1312      *
1313      * @return (Valid) character value, if entity is a character reference,
1314      *   and could be resolved from current input buffer (does not span
1315      *   buffer boundary); null char (code 0) if not (either non-char
1316      *   entity, or spans input buffer boundary).
1317      */
1318     protected int resolveCharOnlyEntity(boolean checkStd)
1319             throws XMLStreamException
1320     {
1321         //int avail = inputInBuffer();
1322         int avail = mInputEnd - mInputPtr;
1323         if (avail < 6) {
1324             // split entity, or buffer boundary
1325             /* Don't want to lose leading '&' (in case we can not expand
1326              * the entity), so let's push it back first
1327              */
1328             --mInputPtr;
1329             /* Shortest valid reference would be 3 chars ('&a;'); which
1330              * would only be legal from an expanded entity...
1331              */
1332             if (!ensureInput(6)) {
1333                 avail = inputInBuffer();
1334                 if (avail < 3) {
1335                     throwUnexpectedEOF(SUFFIX_IN_ENTITY_REF);
1336                 }
1337             } else {
1338                 avail = 6;
1339             }
1340             // ... and now we can move pointer back as well:
1341             ++mInputPtr;
1342         }
1343 
1344         /* Ok, now we have one more character to check, and that's enough
1345          * to determine type decisively.
1346          */
1347         char c = mInputBuffer[mInputPtr];
1348 
1349         // A char reference?
1350         if (c == '#') { // yup
1351             ++mInputPtr;
1352             return resolveCharEnt(null);
1353         }
1354 
1355         // nope... except may be a pre-def?
1356         if (checkStd) {
1357             if (c == 'a') {
1358                 char d = mInputBuffer[mInputPtr+1];
1359                 if (d == 'm') {
1360                     if (avail >= 4
1361                             && mInputBuffer[mInputPtr+2] == 'p'
1362                             && mInputBuffer[mInputPtr+3] == ';') {
1363                         mInputPtr += 4;
1364                         return '&';
1365                     }
1366                 } else if (d == 'p') {
1367                     if (avail >= 5
1368                             && mInputBuffer[mInputPtr+2] == 'o'
1369                             && mInputBuffer[mInputPtr+3] == 's'
1370                             && mInputBuffer[mInputPtr+4] == ';') {
1371                         mInputPtr += 5;
1372                         return '\'';
1373                     }
1374                 }
1375             } else if (c == 'l') {
1376                 if (avail >= 3
1377                         && mInputBuffer[mInputPtr+1] == 't'
1378                         && mInputBuffer[mInputPtr+2] == ';') {
1379                     mInputPtr += 3;
1380                     return '<';
1381                 }
1382             } else if (c == 'g') {
1383                 if (avail >= 3
1384                         && mInputBuffer[mInputPtr+1] == 't'
1385                         && mInputBuffer[mInputPtr+2] == ';') {
1386                     mInputPtr += 3;
1387                     return '>';
1388                 }
1389             } else if (c == 'q') {
1390                 if (avail >= 5
1391                         && mInputBuffer[mInputPtr+1] == 'u'
1392                         && mInputBuffer[mInputPtr+2] == 'o'
1393                         && mInputBuffer[mInputPtr+3] == 't'
1394                         && mInputBuffer[mInputPtr+4] == ';') {
1395                     mInputPtr += 5;
1396                     return '"';
1397                 }
1398             }
1399         }
1400         return 0;
1401     }
1402 
1403     /**
1404      * Reverse of {@link #resolveCharOnlyEntity}; will only resolve entity
1405      * if it is NOT a character entity (or pre-defined 'generic' entity;
1406      * amp, apos, lt, gt or quot). Only used in cases where entities
1407      * are to be separately returned unexpanded (in non-entity-replacing
1408      * mode); which means it's never called from dtd handler.
1409      */
1410     protected EntityDecl resolveNonCharEntity()
1411             throws XMLStreamException
1412     {
1413         //int avail = inputInBuffer();
1414         int avail = mInputEnd - mInputPtr;
1415         if (avail < 6) {
1416             // split entity, or buffer boundary
1417             /* Don't want to lose leading '&' (in case we can not expand
1418              * the entity), so let's push it back first
1419              */
1420             --mInputPtr;
1421 
1422             /* Shortest valid reference would be 3 chars ('&a;'); which
1423              * would only be legal from an expanded entity...
1424              */
1425             if (!ensureInput(6)) {
1426                 avail = inputInBuffer();
1427                 if (avail < 3) {
1428                     throwUnexpectedEOF(SUFFIX_IN_ENTITY_REF);
1429                 }
1430             } else {
1431                 avail = 6;
1432             }
1433             // ... and now we can move pointer back as well:
1434             ++mInputPtr;
1435         }
1436 
1437         // We don't care about char entities:
1438         char c = mInputBuffer[mInputPtr];
1439         if (c == '#') {
1440             return null;
1441         }
1442 
1443         /* 19-Aug-2004, TSa: Need special handling for pre-defined
1444          *   entities; they are not counted as 'real' general parsed
1445          *   entities, but more as character entities...
1446          */
1447 
1448         // have chars at least up to mInputPtr+4 by now
1449         if (c == 'a') {
1450             char d = mInputBuffer[mInputPtr+1];
1451             if (d == 'm') {
1452                 if (avail >= 4
1453                         && mInputBuffer[mInputPtr+2] == 'p'
1454                         && mInputBuffer[mInputPtr+3] == ';') {
1455                     // If not automatically expanding:
1456                     //return sEntityAmp;
1457                     // mInputPtr += 4;
1458                     return null;
1459                 }
1460             } else if (d == 'p') {
1461                 if (avail >= 5
1462                         && mInputBuffer[mInputPtr+2] == 'o'
1463                         && mInputBuffer[mInputPtr+3] == 's'
1464                         && mInputBuffer[mInputPtr+4] == ';') {
1465                     return null;
1466                 }
1467             }
1468         } else if (c == 'l') {
1469             if (avail >= 3
1470                     && mInputBuffer[mInputPtr+1] == 't'
1471                     && mInputBuffer[mInputPtr+2] == ';') {
1472                 return null;
1473             }
1474         } else if (c == 'g') {
1475             if (avail >= 3
1476                     && mInputBuffer[mInputPtr+1] == 't'
1477                     && mInputBuffer[mInputPtr+2] == ';') {
1478                 return null;
1479             }
1480         } else if (c == 'q') {
1481             if (avail >= 5
1482                     && mInputBuffer[mInputPtr+1] == 'u'
1483                     && mInputBuffer[mInputPtr+2] == 'o'
1484                     && mInputBuffer[mInputPtr+3] == 't'
1485                     && mInputBuffer[mInputPtr+4] == ';') {
1486                 return null;
1487             }
1488         }
1489 
1490         // Otherwise, let's just parse in generic way:
1491         ++mInputPtr; // since we already read the first letter
1492         String id = parseEntityName(c);
1493         mCurrName = id;
1494 
1495         return findEntity(id, null);
1496     }
1497 
1498     /**
1499      * Method that does full resolution of an entity reference, be it
1500      * character entity, internal entity or external entity, including
1501      * updating of input buffers, and depending on whether result is
1502      * a character entity (or one of 5 pre-defined entities), returns
1503      * char in question, or null character (code 0) to indicate it had
1504      * to change input source.
1505      *
1506      * @param allowExt If true, is allowed to expand external entities
1507      *   (expanding text); if false, is not (expanding attribute value).
1508      *
1509      * @return Either single-character replacement (which is NOT to be
1510      *    reparsed), or null char (0) to indicate expansion is done via
1511      *    input source.
1512      */
1513     protected int fullyResolveEntity(boolean allowExt)
1514             throws XMLStreamException
1515     {
1516         char c = getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF);
1517         // Do we have a (numeric) character entity reference?
1518         if (c == '#') { // numeric
1519             final StringBuffer originalSurface = new StringBuffer("#");
1520             int ch = resolveCharEnt(originalSurface);
1521             if (mCfgTreatCharRefsAsEntities) {
1522                 final char[] originalChars = new char[originalSurface.length()];
1523                 originalSurface.getChars(0, originalSurface.length(), originalChars, 0);
1524                 mCurrEntity = getIntEntity(ch, originalChars);
1525                 return 0;
1526             }
1527             return ch;
1528         }
1529 
1530         String id = parseEntityName(c);
1531 
1532         // Perhaps we have a pre-defined char reference?
1533         c = id.charAt(0);
1534         /*
1535          * 16-May-2004, TSa: Should custom entities (or ones defined in int/ext subset) override
1536          * pre-defined settings for these?
1537          */
1538         char d = CHAR_NULL;
1539         if (c == 'a') { // amp or apos?
1540             if (id.equals("amp")) {
1541                 d = '&';
1542             } else if (id.equals("apos")) {
1543                 d = '\'';
1544             }
1545         } else if (c == 'g') { // gt?
1546             if (id.length() == 2 && id.charAt(1) == 't') {
1547                 d = '>';
1548             }
1549         } else if (c == 'l') { // lt?
1550             if (id.length() == 2 && id.charAt(1) == 't') {
1551                 d = '<';
1552             }
1553         } else if (c == 'q') { // quot?
1554             if (id.equals("quot")) {
1555                 d = '"';
1556             }
1557         }
1558 
1559         if (d != CHAR_NULL) {
1560             if (mCfgTreatCharRefsAsEntities) {
1561                 final char[] originalChars = new char[id.length()];
1562                 id.getChars(0, id.length(), originalChars, 0);
1563                 mCurrEntity = getIntEntity(d, originalChars);
1564                 return 0;
1565             }
1566             return d;
1567         }
1568 
1569         final EntityDecl e = expandEntity(id, allowExt, null);
1570         if (mCfgTreatCharRefsAsEntities) {
1571             mCurrEntity = e;
1572         }
1573         return 0;
1574     }
1575 
1576     /**
1577      * Returns an entity (possibly from cache) for the argument character using the encoded
1578      * representation in mInputBuffer[entityStartPos ... mInputPtr-1].
1579      */
1580     protected EntityDecl getIntEntity(int ch, final char[] originalChars)
1581     {
1582         String cacheKey = new String(originalChars);
1583 
1584         IntEntity entity = mCachedEntities.get(cacheKey);
1585         if (entity == null) {
1586             String repl;
1587             if (ch <= 0xFFFF) {
1588                 repl = Character.toString((char) ch);
1589             } else {
1590                 StringBuffer sb = new StringBuffer(2);
1591                 ch -= 0x10000;
1592                 sb.append((char) ((ch >> 10)  + 0xD800));
1593                 sb.append((char) ((ch & 0x3FF)  + 0xDC00));
1594                 repl = sb.toString();
1595             }
1596             entity = IntEntity.create(new String(originalChars), repl);
1597             mCachedEntities.put(cacheKey, entity);
1598         }
1599         return entity;
1600     }
1601 
1602 
1603     /**
1604      * Helper method that will try to expand a parsed entity (parameter or
1605      * generic entity).
1606      *<p>
1607      * note: called by sub-classes (dtd parser), needs to be protected.
1608      *
1609      * @param id Name of the entity being expanded
1610      * @param allowExt Whether external entities can be expanded or not; if
1611      *   not, and the entity to expand would be external one, an exception
1612      *   will be thrown
1613      */
1614     protected EntityDecl expandEntity(String id, boolean allowExt,
1615                                       Object extraArg)
1616             throws XMLStreamException
1617     {
1618         mCurrName = id;
1619 
1620         EntityDecl ed = findEntity(id, extraArg);
1621 
1622         if (ed == null) {
1623             /* 30-Sep-2005, TSa: As per [WSTX-5], let's only throw exception
1624              *   if we have to resolve it (otherwise it's just best-effort,
1625              *   and null is ok)
1626              */
1627             /* 02-Oct-2005, TSa: Plus, [WSTX-4] adds "undeclared entity
1628              *    resolver"
1629              */
1630             if (mCfgReplaceEntities) {
1631                 mCurrEntity = expandUnresolvedEntity(id);
1632             }
1633             return null;
1634         }
1635 
1636         if (!mCfgTreatCharRefsAsEntities || this instanceof MinimalDTDReader) {
1637             expandEntity(ed, allowExt);
1638         }
1639 
1640         return ed;
1641     }
1642 
1643     /**
1644      *<p>
1645      * note: defined as private for documentation, ie. it's just called
1646      * from within this class (not sub-classes), from one specific method
1647      * (see above)
1648      *
1649      * @param ed Entity to be expanded
1650      * @param allowExt Whether external entities are allowed or not.
1651      */
1652     private void expandEntity(EntityDecl ed, boolean allowExt)
1653             throws XMLStreamException
1654     {
1655         String id = ed.getName();
1656 
1657         /* Very first thing; we can immediately check if expanding
1658          * this entity would result in infinite recursion:
1659          */
1660         if (mInput.isOrIsExpandedFrom(id)) {
1661             throwRecursionError(id);
1662         }
1663 
1664         /* Should not refer unparsed entities from attribute values
1665          * or text content (except via notation mechanism, but that's
1666          * not parsed here)
1667          */
1668         if (!ed.isParsed()) {
1669             throwParseError("Illegal reference to unparsed external entity \"{0}\"", id, null);
1670         }
1671 
1672         // 28-Jun-2004, TSa: Do we support external entity expansion?
1673         boolean isExt = ed.isExternal();
1674         if (isExt) {
1675             if (!allowExt) { // never ok in attribute value...
1676                 throwParseError("Encountered a reference to external parsed entity \"{0}\" when expanding attribute value: not legal as per XML 1.0/1.1 #3.1", id, null);
1677             }
1678             if (!mConfig.willSupportExternalEntities()) {
1679                 throwParseError("Encountered a reference to external entity \"{0}\", but stream reader has feature \"{1}\" disabled",
1680                         id, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES);
1681             }
1682         }
1683         verifyLimit("Maximum entity expansion count", mConfig.getMaxEntityCount(), ++mEntityExpansionCount);
1684         // First, let's give current context chance to save its stuff
1685         WstxInputSource oldInput = mInput;
1686         oldInput.saveContext(this);
1687         WstxInputSource newInput = null;
1688         try {
1689             newInput = ed.expand(oldInput, mEntityResolver, mConfig, mDocXmlVersion);
1690         } catch (FileNotFoundException fex) {
1691             /* Let's catch and rethrow this just so we get more meaningful
1692              * description (with input source position etc)
1693              */
1694             throwParseError("(was {0}) {1}", fex.getClass().getName(), fex.getMessage());
1695         } catch (IOException ioe) {
1696             throw constructFromIOE(ioe);
1697         }
1698         /* And then we'll need to make sure new input comes from the new
1699          * input source
1700          */
1701         initInputSource(newInput, isExt, id);
1702     }
1703 
1704     /**
1705      *<p>
1706      * note: only called from the local expandEntity() method
1707      */
1708     private EntityDecl expandUnresolvedEntity(String id)
1709             throws XMLStreamException
1710     {
1711         XMLResolver resolver = mConfig.getUndeclaredEntityResolver();
1712         if (resolver != null) {
1713             /* Ok, we can check for recursion here; but let's only do that
1714              * if there is any chance that it might get resolved by
1715              * the special resolver (it must have been resolved this way
1716              * earlier, too...)
1717              */
1718             if (mInput.isOrIsExpandedFrom(id)) {
1719                 throwRecursionError(id);
1720             }
1721 
1722             WstxInputSource oldInput = mInput;
1723             oldInput.saveContext(this);
1724             // null, null -> no public or system ids
1725             int xmlVersion = mDocXmlVersion;
1726             // 05-Feb-2006, TSa: If xmlVersion not explicitly known, defaults to 1.0
1727             if (xmlVersion == XmlConsts.XML_V_UNKNOWN) {
1728                 xmlVersion = XmlConsts.XML_V_10;
1729             }
1730             WstxInputSource newInput;
1731             try {
1732                 newInput = DefaultInputResolver.resolveEntityUsing
1733                         (oldInput, id, null, null, resolver, mConfig, xmlVersion);
1734                 if (mCfgTreatCharRefsAsEntities) {
1735                     return new IntEntity(WstxInputLocation.getEmptyLocation(), newInput.getEntityId(),
1736                             newInput.getSource(), new char[]{}, WstxInputLocation.getEmptyLocation());
1737                 }
1738             } catch (IOException ioe) {
1739                 throw constructFromIOE(ioe);
1740             }
1741             if (newInput != null) {
1742                 // true -> is external
1743                 initInputSource(newInput, true, id);
1744                 return null;
1745             }
1746         }
1747         handleUndeclaredEntity(id);
1748         return null;
1749     }
1750 
1751     /*
1752     ///////////////////////////////////////////////////////////////////////
1753     // Abstract methods for sub-classes to implement
1754     ///////////////////////////////////////////////////////////////////////
1755      */
1756 
    /**
     * Abstract method for sub-classes to implement, for finding
     * a declared general or parsed entity.
     *<p>
     * The entity expansion code of this class treats a null return value
     * as "no declaration found" for the name.
     *
     * @param id Identifier of the entity to find
     * @param arg Optional argument passed from caller; needed by DTD
     *    reader.
     *
     * @return Declaration of the entity; or null if there is none
     */
    protected abstract EntityDecl findEntity(String id, Object arg)
            throws XMLStreamException;
1767 
    /**
     * This method gets called if a declaration for an entity was not
     * found in entity expanding mode (enabled by default for xml reader,
     * always enabled for dtd reader).
     *<p>
     * Within this class it is reached when no undeclared entity resolver
     * is configured, or when that resolver did not produce any input for
     * the entity.
     *
     * @param id Name of the entity that has no declaration
     */
    protected abstract void handleUndeclaredEntity(String id)
            throws XMLStreamException;
1775 
    /**
     * Abstract method for sub-classes to implement.
     *<p>
     * NOTE(review): judging by the name and argument, this is presumably
     * the hook for reporting that an input source (entity expansion) ended
     * while some construct was still incomplete; the caller is not visible
     * from this part of the class, so confirm against it.
     *
     * @param closing Input source the problem concerns (presumably the one
     *    being closed -- TODO confirm)
     */
    protected abstract void handleIncompleteEntityProblem(WstxInputSource closing)
            throws XMLStreamException;
1778 
1779     /*
1780     ///////////////////////////////////////////////////////////////////////
1781     // Basic tokenization
1782     ///////////////////////////////////////////////////////////////////////
1783      */
1784 
1785     /**
1786      * Method that will parse name token (roughly equivalent to XML specs;
1787      * although bit lenier for more efficient handling); either uri prefix,
1788      * or local name.
1789      *<p>
1790      * Much of complexity in this method has to do with the intention to
1791      * try to avoid any character copies. In this optimal case algorithm
1792      * would be fairly simple. However, this only works if all data is
1793      * already in input buffer... if not, copy has to be made halfway
1794      * through parsing, and that complicates things.
1795      *<p>
1796      * One thing to note is that String returned has been canonicalized
1797      * and (if necessary) added to symbol table. It can thus be compared
1798      * against other such (usually id) Strings, with simple equality operator.
1799      *
1800      * @param c First character of the name; not yet checked for validity
1801      *
1802      * @return Canonicalized name String (which may have length 0, if
1803      *    EOF or non-name-start char encountered)
1804      */
1805     protected String parseLocalName(char c)
1806             throws XMLStreamException
1807     {
1808         /* Has to start with letter, or '_' (etc); we won't allow ':' as that
1809          * is taken as namespace separator; no use trying to optimize
1810          * heavily as it's 98% likely it is a valid char...
1811          */
1812         if (!isNameStartChar(c)) {
1813             if (c == ':') {
1814                 throwUnexpectedChar(c, " (missing namespace prefix?)");
1815             }
1816             throwUnexpectedChar(c, " (expected a name start character)");
1817         }
1818 
1819         int ptr = mInputPtr;
1820         int hash = c;
1821         final int inputLen = mInputEnd;
1822         int startPtr = ptr-1; // already read previous char
1823         final char[] inputBuf = mInputBuffer;
1824 
1825         /* After which there may be zero or more name chars
1826          * we have to consider
1827          */
1828         while (true) {
1829             if (ptr >= inputLen) {
1830                 /* Ok, identifier may continue past buffer end, need
1831                  * to continue with part 2 (separate method, as this is
1832                  * not as common as having it all in buffer)
1833                  */
1834                 mInputPtr = ptr;
1835                 return parseLocalName2(startPtr, hash);
1836             }
1837             // Ok, we have the char... is it a name char?
1838             c = inputBuf[ptr];
1839             if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
1840                 break;
1841             }
1842             if (!isNameChar(c)) {
1843                 break;
1844             }
1845             hash = (hash * 31) + c;
1846             ++ptr;
1847         }
1848         mInputPtr = ptr;
1849         return mSymbols.findSymbol(mInputBuffer, startPtr, ptr - startPtr, hash);
1850     }
1851 
1852     /**
1853      * Second part of name token parsing; called when name can continue
1854      * past input buffer end (so only part was read before calling this
1855      * method to read the rest).
1856      *<p>
1857      * Note that this isn't heavily optimized, on assumption it's not
1858      * called very often.
1859      */
1860     protected String parseLocalName2(int start, int hash)
1861             throws XMLStreamException
1862     {
1863         int ptr = mInputEnd - start;
1864         // Let's assume fairly short names
1865         char[] outBuf = getNameBuffer(ptr+8);
1866 
1867         if (ptr > 0) {
1868             System.arraycopy(mInputBuffer, start, outBuf, 0, ptr);
1869         }
1870 
1871         int outLen = outBuf.length;
1872         while (true) {
1873             // note: names can not cross input block (entity) boundaries...
1874             if (mInputPtr >= mInputEnd) {
1875                 if (!loadMoreFromCurrent()) {
1876                     break;
1877                 }
1878             }
1879             char c = mInputBuffer[mInputPtr];
1880             if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
1881                 break;
1882             }
1883             if (!isNameChar(c)) {
1884                 break;
1885             }
1886             ++mInputPtr;
1887             if (ptr >= outLen) {
1888                 mNameBuffer = outBuf = expandBy50Pct(outBuf);
1889                 outLen = outBuf.length;
1890             }
1891             outBuf[ptr++] = c;
1892             hash = (hash * 31) + c;
1893         }
1894         // Still need to canonicalize the name:
1895         return mSymbols.findSymbol(outBuf, 0, ptr, hash);
1896     }
1897 
1898     /**
1899      * Method that will parse 'full' name token; what full means depends on
1900      * whether reader is namespace aware or not. If it is, full name means
1901      * local name with no namespace prefix (PI target, entity/notation name);
1902      * if not, name can contain arbitrary number of colons. Note that
1903      * element and attribute names are NOT parsed here, so actual namespace
1904      * prefix separation can be handled properly there.
1905      *<p>
1906      * Similar to {@link #parseLocalName}, much of complexity stems from
1907      * trying to avoid copying name characters from input buffer.
1908      *<p>
1909      * Note that returned String will be canonicalized, similar to
1910      * {@link #parseLocalName}, but without separating prefix/local name.
1911      *
1912      * @return Canonicalized name String (which may have length 0, if
1913      *    EOF or non-name-start char encountered)
1914      */
1915     protected String parseFullName()
1916             throws XMLStreamException
1917     {
1918         if (mInputPtr >= mInputEnd) {
1919             loadMoreFromCurrent();
1920         }
1921         return parseFullName(mInputBuffer[mInputPtr++]);
1922     }
1923 
1924     protected String parseFullName(char c)
1925             throws XMLStreamException
1926     {
1927         // First char has special handling:
1928         if (!isNameStartChar(c)) {
1929             if (c == ':') { // no name.... generally an error:
1930                 if (mCfgNsEnabled) {
1931                     throwNsColonException(parseFNameForError());
1932                 }
1933                 // Ok, that's fine actually
1934             } else {
1935                 if (c <= CHAR_SPACE) {
1936                     throwUnexpectedChar(c, " (missing name?)");
1937                 }
1938                 throwUnexpectedChar(c, " (expected a name start character)");
1939             }
1940         }
1941 
1942         int ptr = mInputPtr;
1943         int hash = c;
1944         int inputLen = mInputEnd;
1945         int startPtr = ptr-1; // to account for the first char
1946 
1947         /* After which there may be zero or more name chars
1948          * we have to consider
1949          */
1950         while (true) {
1951             if (ptr >= inputLen) {
1952                 /* Ok, identifier may continue past buffer end, need
1953                  * to continue with part 2 (separate method, as this is
1954                  * not as common as having it all in buffer)
1955                  */
1956                 mInputPtr = ptr;
1957                 return parseFullName2(startPtr, hash);
1958             }
1959             c = mInputBuffer[ptr];
1960             if (c == ':') { // colon only allowed in non-NS mode
1961                 if (mCfgNsEnabled) {
1962                     mInputPtr = ptr;
1963                     throwNsColonException(new String(mInputBuffer, startPtr, ptr - startPtr) + parseFNameForError());
1964                 }
1965             } else {
1966                 if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
1967                     break;
1968                 }
1969                 if (!isNameChar(c)) {
1970                     break;
1971                 }
1972             }
1973             hash = (hash * 31) + c;
1974             ++ptr;
1975         }
1976         mInputPtr = ptr;
1977         return mSymbols.findSymbol(mInputBuffer, startPtr, ptr - startPtr, hash);
1978     }
1979 
1980     @SuppressWarnings("cast")
1981     protected String parseFullName2(int start, int hash)
1982             throws XMLStreamException
1983     {
1984         int ptr = mInputEnd - start;
1985         // Let's assume fairly short names
1986         char[] outBuf = getNameBuffer(ptr+8);
1987 
1988         if (ptr > 0) {
1989             System.arraycopy(mInputBuffer, start, outBuf, 0, ptr);
1990         }
1991 
1992         int outLen = outBuf.length;
1993         while (true) {
1994             /* 06-Sep-2004, TSa: Name tokens are not allowed to continue
1995              *   past entity expansion ranges... that is, all characters
1996              *   have to come from the same input source. Thus, let's only
1997              *   load things from same input level
1998              */
1999             if (mInputPtr >= mInputEnd) {
2000                 if (!loadMoreFromCurrent()) {
2001                     break;
2002                 }
2003             }
2004             char c = mInputBuffer[mInputPtr];
2005             if (c == ':') { // colon only allowed in non-NS mode
2006                 if (mCfgNsEnabled) {
2007                     throwNsColonException(new String(outBuf, 0, ptr) + c + parseFNameForError());
2008                 }
2009             } else if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
2010                 break;
2011             } else if (!isNameChar(c)) {
2012                 break;
2013             }
2014             ++mInputPtr;
2015 
2016             if (ptr >= outLen) {
2017                 mNameBuffer = outBuf = expandBy50Pct(outBuf);
2018                 outLen = outBuf.length;
2019             }
2020             outBuf[ptr++] = c;
2021             hash = (hash * 31) + (int) c;
2022         }
2023 
2024         // Still need to canonicalize the name:
2025         return mSymbols.findSymbol(outBuf, 0, ptr, hash);
2026     }
2027 
2028     /**
2029      * Method called to read in full name, including unlimited number of
2030      * namespace separators (':'), for the purpose of displaying name in
2031      * an error message. Won't do any further validations, and parsing
2032      * is not optimized: main need is just to get more meaningful error
2033      * messages.
2034      */
2035     protected String parseFNameForError()
2036             throws XMLStreamException
2037     {
2038         StringBuilder sb = new StringBuilder(100);
2039         while (true) {
2040             char c;
2041 
2042             if (mInputPtr < mInputEnd) {
2043                 c = mInputBuffer[mInputPtr++];
2044             } else { // can't error here, so let's accept EOF for now:
2045                 int i = getNext();
2046                 if (i < 0) {
2047                     break;
2048                 }
2049                 c = (char) i;
2050             }
2051             if (c != ':' && !isNameChar(c)) {
2052                 --mInputPtr;
2053                 break;
2054             }
2055             sb.append(c);
2056         }
2057         return sb.toString();
2058     }
2059 
2060     protected final String parseEntityName(char c)
2061             throws XMLStreamException
2062     {
2063         String id = parseFullName(c);
2064         // Needs to be followed by a semi-colon, too.. from same input source:
2065         if (mInputPtr >= mInputEnd) {
2066             if (!loadMoreFromCurrent()) {
2067                 throwParseError("Missing semicolon after reference for entity \"{0}\"", id, null);
2068             }
2069         }
2070         c = mInputBuffer[mInputPtr++];
2071         if (c != ';') {
2072             throwUnexpectedChar(c, "; expected a semi-colon after the reference for entity '"+id+"'");
2073         }
2074         return id;
2075     }
2076 
2077     /**
2078      * Note: does not check for number of colons, amongst other things.
2079      * Main idea is to skip through what superficially seems like a valid
2080      * id, nothing more. This is only done when really skipping through
2081      * something we do not care about at all: not even whether names/ids
2082      * would be valid (for example, when ignoring internal DTD subset).
2083      *
2084      * @return Length of skipped name.
2085      */
2086     protected int skipFullName(char c)
2087             throws XMLStreamException
2088     {
2089         if (!isNameStartChar(c)) {
2090             --mInputPtr;
2091             return 0;
2092         }
2093 
2094         /* After which there may be zero or more name chars
2095          * we have to consider
2096          */
2097         int count = 1;
2098         while (true) {
2099             c = (mInputPtr < mInputEnd) ?
2100                     mInputBuffer[mInputPtr++] : getNextChar(SUFFIX_EOF_EXP_NAME);
2101             if (c != ':' && !isNameChar(c)) {
2102                 break;
2103             }
2104             ++count;
2105         }
2106         return count;
2107     }
2108 
2109     /**
2110      * Simple parsing method that parses system ids, which are generally
2111      * used in entities (from DOCTYPE declaration to internal/external
2112      * subsets).
2113      *<p>
2114      * NOTE: returned String is not canonicalized, on assumption that
2115      * external ids may be longish, and are not shared all that often, as
2116      * they are generally just used for resolving paths, if anything.
2117      *<br />
2118      * Also note that this method is not heavily optimized, as it's not
2119      * likely to be a bottleneck for parsing.
2120      */
2121     protected final String parseSystemId(char quoteChar, boolean convertLFs,
2122                                          String errorMsg)
2123             throws XMLStreamException
2124     {
2125         char[] buf = getNameBuffer(-1);
2126         int ptr = 0;
2127 
2128         while (true) {
2129             char c = (mInputPtr < mInputEnd) ?
2130                     mInputBuffer[mInputPtr++] : getNextChar(errorMsg);
2131             if (c == quoteChar) {
2132                 break;
2133             }
2134             /* ??? 14-Jun-2004, TSa: Should we normalize linefeeds or not?
2135              *   It seems like we should, for all input... so that's the way it
2136              *   works.
2137              */
2138             if (c == '\n') {
2139                 markLF();
2140             } else if (c == '\r') {
2141                 if (peekNext() == '\n') {
2142                     ++mInputPtr;
2143                     if (!convertLFs) {
2144                         /* The only tricky thing; need to preserve 2-char LF; need to
2145                          * output one char from here, then can fall back to default:
2146                          */
2147                         if (ptr >= buf.length) {
2148                             buf = expandBy50Pct(buf);
2149                         }
2150                         buf[ptr++] = '\r';
2151                     }
2152                     c = '\n';
2153                 } else if (convertLFs) {
2154                     c = '\n';
2155                 }
2156             }
2157 
2158             // Other than that, let's just append it:
2159             if (ptr >= buf.length) {
2160                 buf = expandBy50Pct(buf);
2161             }
2162             buf[ptr++] = c;
2163         }
2164 
2165         return (ptr == 0) ? "" : new String(buf, 0, ptr);
2166     }
2167 
2168     /**
2169      * Simple parsing method that parses system ids, which are generally
2170      * used in entities (from DOCTYPE declaration to internal/external
2171      * subsets).
2172      *<p>
2173      * As per xml specs, the contents are actually normalized.
2174      *<p>
2175      * NOTE: returned String is not canonicalized, on assumption that
2176      * external ids may be longish, and are not shared all that often, as
2177      * they are generally just used for resolving paths, if anything.
2178      *<br />
2179      * Also note that this method is not heavily optimized, as it's not
2180      * likely to be a bottleneck for parsing.
2181      */
2182     protected final String parsePublicId(char quoteChar, String errorMsg)
2183             throws XMLStreamException
2184     {
2185         char[] buf = getNameBuffer(-1);
2186         int ptr = 0;
2187         boolean spaceToAdd = false;
2188 
2189         while (true) {
2190             char c = (mInputPtr < mInputEnd) ?
2191                     mInputBuffer[mInputPtr++] : getNextChar(errorMsg);
2192             if (c == quoteChar) {
2193                 break;
2194             }
2195             if (c == '\n') {
2196                 markLF();
2197                 spaceToAdd = true;
2198                 continue;
2199             } else if (c == '\r') {
2200                 if (peekNext() == '\n') {
2201                     ++mInputPtr;
2202                 }
2203                 spaceToAdd = true;
2204                 continue;
2205             } else if (c == CHAR_SPACE) {
2206                 spaceToAdd = true;
2207                 continue;
2208             } else {
2209                 // Verify it's a legal pubid char (see XML spec, #13, from 2.3)
2210                 if ((c >= VALID_PUBID_CHAR_COUNT)
2211                         || sPubidValidity[c] != PUBID_CHAR_VALID_B) {
2212                     throwUnexpectedChar(c, " in public identifier");
2213                 }
2214             }
2215 
2216             // Other than that, let's just append it:
2217             if (ptr >= buf.length) {
2218                 buf = expandBy50Pct(buf);
2219             }
2220             /* Space-normalization means scrapping leading and trailing
2221              * white space, and coalescing remaining ws into single spaces.
2222              */
2223             if (spaceToAdd) { // pending white space to add?
2224                 if (c == CHAR_SPACE) { // still a space; let's skip
2225                     continue;
2226                 }
2227                 /* ok: if we have non-space, we'll either forget about
2228                  * space(s) (if nothing has been output, ie. leading space),
2229                  * or output a single space (in-between non-white space)
2230                  */
2231                 spaceToAdd = false;
2232                 if (ptr > 0) {
2233                     buf[ptr++] = CHAR_SPACE;
2234                     if (ptr >= buf.length) {
2235                         buf = expandBy50Pct(buf);
2236                     }
2237                 }
2238             }
2239             buf[ptr++] = c;
2240         }
2241 
2242         return (ptr == 0) ? "" : new String(buf, 0, ptr);
2243     }
2244 
2245     protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs,
2246                                     String errorMsg)
2247             throws XMLStreamException
2248     {
2249         // Let's first ensure we have some data in there...
2250         if (mInputPtr >= mInputEnd) {
2251             loadMore(errorMsg);
2252         }
2253         while (true) {
2254             // Let's loop consequtive 'easy' spans:
2255             char[] inputBuf = mInputBuffer;
2256             int inputLen = mInputEnd;
2257             int ptr = mInputPtr;
2258             int startPtr = ptr;
2259             while (ptr < inputLen) {
2260                 char c = inputBuf[ptr++];
2261                 if (c == endChar) {
2262                     int thisLen = ptr - startPtr - 1;
2263                     if (thisLen > 0) {
2264                         tb.append(inputBuf, startPtr, thisLen);
2265                     }
2266                     mInputPtr = ptr;
2267                     return;
2268                 }
2269                 if (c == '\n') {
2270                     mInputPtr = ptr; // markLF() requires this
2271                     markLF();
2272                 } else if (c == '\r') {
2273                     if (!convertLFs && ptr < inputLen) {
2274                         if (inputBuf[ptr] == '\n') {
2275                             ++ptr;
2276                         }
2277                         mInputPtr = ptr;
2278                         markLF();
2279                     } else {
2280                         int thisLen = ptr - startPtr - 1;
2281                         if (thisLen > 0) {
2282                             tb.append(inputBuf, startPtr, thisLen);
2283                         }
2284                         mInputPtr = ptr;
2285                         c = getNextChar(errorMsg);
2286                         if (c != '\n') {
2287                             --mInputPtr; // pusback
2288                             tb.append(convertLFs ? '\n' : '\r');
2289                         } else {
2290                             if (convertLFs) {
2291                                 tb.append('\n');
2292                             } else {
2293                                 tb.append('\r');
2294                                 tb.append('\n');
2295                             }
2296                         }
2297                         startPtr = ptr = mInputPtr;
2298                         markLF();
2299                     }
2300                 }
2301             }
2302             int thisLen = ptr - startPtr;
2303             if (thisLen > 0) {
2304                 tb.append(inputBuf, startPtr, thisLen);
2305             }
2306             loadMore(errorMsg);
2307             startPtr = ptr = mInputPtr;
2308             inputBuf = mInputBuffer;
2309             inputLen = mInputEnd;
2310         }
2311     }
2312 
2313     /*
2314     ///////////////////////////////////////////////////////////////////////
2315     // Internal methods
2316     ///////////////////////////////////////////////////////////////////////
2317      */
2318 
2319     private int resolveCharEnt(StringBuffer originalCharacters)
2320             throws XMLStreamException
2321     {
2322         int value = 0;
2323         char c = getNextChar(SUFFIX_IN_ENTITY_REF);
2324 
2325         if (originalCharacters != null) {
2326             originalCharacters.append(c);
2327         }
2328 
2329         if (c == 'x') { // hex
2330             while (true) {
2331                 c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
2332                         : getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF);
2333                 if (c == ';') {
2334                     break;
2335                 }
2336 
2337                 if (originalCharacters != null) {
2338                     originalCharacters.append(c);
2339                 }
2340                 value = value << 4;
2341                 if (c <= '9' && c >= '0') {
2342                     value += (c - '0');
2343                 } else if (c >= 'a' && c <= 'f') {
2344                     value += 10 + (c - 'a');
2345                 } else if (c >= 'A' && c <= 'F') {
2346                     value += 10 + (c - 'A');
2347                 } else {
2348                     throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
2349                 }
2350                 // Overflow?
2351                 if (value > MAX_UNICODE_CHAR) {
2352                     reportUnicodeOverflow();
2353                 }
2354             }
2355         } else { // numeric (decimal)
2356             while (c != ';') {
2357                 if (c <= '9' && c >= '0') {
2358                     value = (value * 10) + (c - '0');
2359                     // Overflow?
2360                     if (value > MAX_UNICODE_CHAR) {
2361                         reportUnicodeOverflow();
2362                     }
2363                 } else {
2364                     throwUnexpectedChar(c, "; expected a decimal number.");
2365                 }
2366                 c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
2367                         : getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF);
2368 
2369                 if (originalCharacters != null && c != ';') {
2370                     originalCharacters.append(c);
2371                 }
2372             }
2373         }
2374         validateChar(value);
2375         return value;
2376     }
2377 
2378     /**
2379      * Method that will verify that expanded Unicode codepoint is a valid
2380      * XML content character.
2381      */
2382     private final void validateChar(int value)
2383             throws XMLStreamException
2384     {
2385         /* 24-Jan-2006, TSa: Ok, "high" Unicode chars are problematic,
2386          *   need to be reported by a surrogate pair..
2387          */
2388         if (value >= 0xD800) {
2389             if (value < 0xE000) { // no surrogates via entity expansion
2390                 reportIllegalChar(value);
2391             }
2392             if (value > 0xFFFF) {
2393                 // Within valid range at all?
2394                 if (value > MAX_UNICODE_CHAR) {
2395                     reportUnicodeOverflow();
2396                 }
2397             } else if (value >= 0xFFFE) { // 0xFFFE and 0xFFFF are illegal too
2398                 reportIllegalChar(value);
2399             }
2400             // Ok, fine as is
2401         } else if (value < 32) {
2402             if (value == 0) {
2403                 throwParseError("Invalid character reference: null character not allowed in XML content.");
2404             }
2405             // XML 1.1 allows most other chars; 1.0 does not:
2406             if (!mXml10AllowAllEscapedChars) {
2407                 if (!mXml11 &&
2408                         (value != 0x9 && value != 0xA && value != 0xD)) {
2409                     reportIllegalChar(value);
2410                 }
2411             }
2412         }
2413     }
2414 
2415     protected final char[] getNameBuffer(int minSize)
2416     {
2417         char[] buf = mNameBuffer;
2418 
2419         if (buf == null) {
2420             mNameBuffer = buf = new char[(minSize > 48) ? (minSize+16) : 64];
2421         } else if (minSize >= buf.length) { // let's allow one char extra...
2422             int len = buf.length;
2423             len += (len >> 1); // grow by 50%
2424             mNameBuffer = buf = new char[(minSize >= len) ? (minSize+16) : len];
2425         }
2426         return buf;
2427     }
2428 
2429     protected final char[] expandBy50Pct(char[] buf)
2430     {
2431         int len = buf.length;
2432         char[] newBuf = new char[len + (len >> 1)];
2433         System.arraycopy(buf, 0, newBuf, 0, len);
2434         return newBuf;
2435     }
2436 
2437     /**
2438      * Method called to throw an exception indicating that a name that
2439      * should not be namespace-qualified (PI target, entity/notation name)
2440      * is one, and reader is namespace aware.
2441      */
2442     private void throwNsColonException(String name)
2443             throws XMLStreamException
2444     {
2445         throwParseError("Illegal name \"{0}\" (PI target, entity/notation name): can not contain a colon (XML Namespaces 1.0#6)", name, null);
2446     }
2447 
2448     private void throwRecursionError(String entityName)
2449             throws XMLStreamException
2450     {
2451         throwParseError("Illegal entity expansion: entity \"{0}\" expands itself recursively.", entityName, null);
2452     }
2453 
2454     private void reportUnicodeOverflow()
2455             throws XMLStreamException
2456     {
2457         throwParseError("Illegal character entity: value higher than max allowed (0x{0})", Integer.toHexString(MAX_UNICODE_CHAR), null);
2458     }
2459 
2460     private void reportIllegalChar(int value)
2461             throws XMLStreamException
2462     {
2463         throwParseError("Illegal character entity: expansion character (code 0x{0}", Integer.toHexString(value), null);
2464     }
2465 
2466     protected void verifyLimit(String type, long maxValue, long currentValue)
2467             throws XMLStreamException
2468     {
2469         if (currentValue > maxValue) {
2470             throw constructLimitViolation(type, maxValue);
2471         }
2472     }
2473 
2474     protected XMLStreamException constructLimitViolation(String type, long limit)
2475             throws XMLStreamException
2476     {
2477         return new XMLStreamException(type+" limit ("+limit+") exceeded");
2478     }
2479 }