1 /* Woodstox XML processor
2 *
3 * Copyright (c) 2004- Tatu Saloranta, tatu.saloranta@iki.fi
4 *
5 * Licensed under the License specified in file LICENSE, included with
6 * the source code.
7 * You may not use this file except in compliance with the License.
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 package com.ctc.wstx.sr;
17
18 import java.io.FileNotFoundException;
19 import java.io.IOException;
20 import java.net.URL;
21 import java.text.MessageFormat;
22 import java.util.Collections;
23 import java.util.HashMap;
24 import java.util.Map;
25
26 import javax.xml.stream.Location;
27 import javax.xml.stream.XMLInputFactory;
28 import javax.xml.stream.XMLReporter;
29 import javax.xml.stream.XMLResolver;
30 import javax.xml.stream.XMLStreamException;
31
32 import org.codehaus.stax2.XMLReporter2;
33 import org.codehaus.stax2.XMLStreamLocation2;
34 import org.codehaus.stax2.validation.XMLValidationProblem;
35
36 import com.ctc.wstx.api.ReaderConfig;
37 import com.ctc.wstx.cfg.ErrorConsts;
38 import com.ctc.wstx.cfg.InputConfigFlags;
39 import com.ctc.wstx.cfg.ParsingErrorMsgs;
40 import com.ctc.wstx.cfg.XmlConsts;
41 import com.ctc.wstx.dtd.MinimalDTDReader;
42 import com.ctc.wstx.ent.EntityDecl;
43 import com.ctc.wstx.ent.IntEntity;
44 import com.ctc.wstx.exc.*;
45 import com.ctc.wstx.io.DefaultInputResolver;
46 import com.ctc.wstx.io.WstxInputData;
47 import com.ctc.wstx.io.WstxInputLocation;
48 import com.ctc.wstx.io.WstxInputSource;
49 import com.ctc.wstx.util.ExceptionUtil;
50 import com.ctc.wstx.util.SymbolTable;
51 import com.ctc.wstx.util.TextBuffer;
52
53 /**
54 * Abstract base class that defines some basic functionality that all
55 * Woodstox reader classes (main XML reader, DTD reader) extend from.
56 */
57 public abstract class StreamScanner
58 extends WstxInputData
59 implements InputProblemReporter,
60 InputConfigFlags, ParsingErrorMsgs
61 {
62
63 // // // Some well-known chars:
64
65 /**
66 * Last (highest) char code of the three, LF, CR and NULL
67 */
68 public final static char CHAR_CR_LF_OR_NULL = (char) 13;
69
70 public final static int INT_CR_LF_OR_NULL = 13;
71
72 /**
73 * Character that allows quick check of whether a char can potentially
74 * be some kind of markup, WRT input stream processing;
75 * has to contain linefeeds, &, < and > (">" only matters when
76 * quoting text, as part of "]]>")
77 */
78 protected final static char CHAR_FIRST_PURE_TEXT = (char) ('>' + 1);
79
80
81 /**
82 * First character in Unicode (ie one with lowest id) that is legal
83 * as part of a local name (all valid name chars minus ':'). Used
84 * for doing quick check for local name end; usually name ends in
85 * a whitespace or equals sign.
86 */
87 protected final static char CHAR_LOWEST_LEGAL_LOCALNAME_CHAR = '-';
88
89 /*
90 ///////////////////////////////////////////////////////////////////////
91 // Character validity constants, structs
92 ///////////////////////////////////////////////////////////////////////
93 */
94
95 /**
96 * We will only use validity array for first 256 characters, mostly
97 * because after those characters it's easier to do fairly simple
98 * block checks.
99 */
100 private final static int VALID_CHAR_COUNT = 0x100;
101
102 private final static byte NAME_CHAR_INVALID_B = (byte) 0;
103 private final static byte NAME_CHAR_ALL_VALID_B = (byte) 1;
104 private final static byte NAME_CHAR_VALID_NONFIRST_B = (byte) -1;
105
106 private final static byte[] sCharValidity = new byte[VALID_CHAR_COUNT];
107
/*
 * Populates the name-character validity table for character codes
 * 0x00 - 0xFF. Entries that are not assigned here keep the array's
 * default value, which equals NAME_CHAR_INVALID_B (0).
 */
static {
    // First, since all valid-as-first chars are also valid-as-other chars,
    // we'll initialize common chars: underscore and ASCII letters
    sCharValidity['_'] = NAME_CHAR_ALL_VALID_B;
    for (int i = 0, last = ('z' - 'a'); i <= last; ++i) {
        sCharValidity['A' + i] = NAME_CHAR_ALL_VALID_B;
        sCharValidity['a' + i] = NAME_CHAR_ALL_VALID_B;
    }
    /* Latin-1 letters: XML 1.0 allows 0xC0-0xD6, 0xD8-0xF6 and 0xF8-0xFF
     * as name (start) characters. The whole 0xC0-0xFF range is marked
     * first (the previous upper bound, "i < 0xF6", wrongly left 0xF6 and
     * 0xF8-0xFF as invalid), and the two exceptions are reverted below.
     */
    for (int i = 0xC0; i <= 0xFF; ++i) { // not all are fully valid, but
        sCharValidity[i] = NAME_CHAR_ALL_VALID_B;
    }
    // ... now we can 'revert' ones not fully valid:
    sCharValidity[0xD7] = NAME_CHAR_INVALID_B; // multiplication sign
    sCharValidity[0xF7] = NAME_CHAR_INVALID_B; // division sign

    // And then we can proceed with ones only valid-as-other.
    sCharValidity['-'] = NAME_CHAR_VALID_NONFIRST_B;
    sCharValidity['.'] = NAME_CHAR_VALID_NONFIRST_B;
    sCharValidity[0xB7] = NAME_CHAR_VALID_NONFIRST_B;
    for (int i = '0'; i <= '9'; ++i) {
        sCharValidity[i] = NAME_CHAR_VALID_NONFIRST_B;
    }
}
131
132 /**
133 * Public identifiers only use 7-bit ascii range.
134 */
135 private final static int VALID_PUBID_CHAR_COUNT = 0x80;
136 private final static byte[] sPubidValidity = new byte[VALID_PUBID_CHAR_COUNT];
137 // private final static byte PUBID_CHAR_INVALID_B = (byte) 0;
138 private final static byte PUBID_CHAR_VALID_B = (byte) 1;
139 static {
140 for (int i = 0, last = ('z' - 'a'); i <= last; ++i) {
141 sPubidValidity['A' + i] = PUBID_CHAR_VALID_B;
142 sPubidValidity['a' + i] = PUBID_CHAR_VALID_B;
143 }
144 for (int i = '0'; i <= '9'; ++i) {
145 sPubidValidity[i] = PUBID_CHAR_VALID_B;
146 }
147
148 // 3 main white space types are valid
149 sPubidValidity[0x0A] = PUBID_CHAR_VALID_B;
150 sPubidValidity[0x0D] = PUBID_CHAR_VALID_B;
151 sPubidValidity[0x20] = PUBID_CHAR_VALID_B;
152
153 // And many of punctuation/separator ascii chars too:
154 sPubidValidity['-'] = PUBID_CHAR_VALID_B;
155 sPubidValidity['\''] = PUBID_CHAR_VALID_B;
156 sPubidValidity['('] = PUBID_CHAR_VALID_B;
157 sPubidValidity[')'] = PUBID_CHAR_VALID_B;
158 sPubidValidity['+'] = PUBID_CHAR_VALID_B;
159 sPubidValidity[','] = PUBID_CHAR_VALID_B;
160 sPubidValidity['.'] = PUBID_CHAR_VALID_B;
161 sPubidValidity['/'] = PUBID_CHAR_VALID_B;
162 sPubidValidity[':'] = PUBID_CHAR_VALID_B;
163 sPubidValidity['='] = PUBID_CHAR_VALID_B;
164 sPubidValidity['?'] = PUBID_CHAR_VALID_B;
165 sPubidValidity[';'] = PUBID_CHAR_VALID_B;
166 sPubidValidity['!'] = PUBID_CHAR_VALID_B;
167 sPubidValidity['*'] = PUBID_CHAR_VALID_B;
168 sPubidValidity['#'] = PUBID_CHAR_VALID_B;
169 sPubidValidity['@'] = PUBID_CHAR_VALID_B;
170 sPubidValidity['$'] = PUBID_CHAR_VALID_B;
171 sPubidValidity['_'] = PUBID_CHAR_VALID_B;
172 sPubidValidity['%'] = PUBID_CHAR_VALID_B;
173 }
174
175 /*
176 ///////////////////////////////////////////////////////////////////////
177 // Basic configuration
178 ///////////////////////////////////////////////////////////////////////
179 */
180
181 /**
182 * Copy of the configuration object passed by the factory.
183 * Contains immutable settings for this reader (or in case
184 * of DTD parsers, reader that uses it)
185 */
186 protected final ReaderConfig mConfig;
187
188 // // // Various extracted settings:
189
190 /**
191 * If true, Reader is namespace aware, and should do basic checks
192 * (usually enforcing limitations on having colons in names)
193 */
194 protected final boolean mCfgNsEnabled;
195
196 // Extracted standard on/off settings:
197
198 /**
199 * note: left non-final on purpose: sub-class may need to modify
200 * the default value after construction.
201 */
202 protected boolean mCfgReplaceEntities;
203
204 /*
205 ///////////////////////////////////////////////////////////////////////
206 // Symbol handling, if applicable
207 ///////////////////////////////////////////////////////////////////////
208 */
209
210 final SymbolTable mSymbols;
211
212 /**
213 * Local full name for the event, if it has one (note: element events
214 * do NOT use this variable; those names are stored in element stack):
215 * target for processing instructions.
216 *<p>
217 * Currently used for proc. instr. target, and entity name (at least
218 * when current entity reference is null).
219 *<p>
220 * Note: this variable is generally not cleared, since it comes from
221 * a symbol table, ie. this won't be the only reference.
222 */
223 protected String mCurrName;
224
225 /*
226 ///////////////////////////////////////////////////////////////////////
227 // Input handling
228 ///////////////////////////////////////////////////////////////////////
229 */
230
231 /**
232 * Currently active input source; contains link to parent (nesting) input
233 * sources, if any.
234 */
235 protected WstxInputSource mInput;
236
237 /**
238 * Top-most input source this reader can use; due to input source
239 * chaining, this is not necessarily the root of all input; for example,
240 * external DTD subset reader's root input still has original document
241 * input as its parent.
242 */
243 protected final WstxInputSource mRootInput;
244
245 /**
246 * Custom resolver used to handle external entities that are to be expanded
247 * by this reader (external param/general entity expander)
248 */
249 protected XMLResolver mEntityResolver = null;
250
251 /**
252 * This is the current depth of the input stack (same as what input
253 * element stack would return as its depth).
254 * It is used to enforce input scope constraints for nesting of
255 * elements (for xml reader) and dtd declaration (for dtd reader)
256 * with regards to input block (entity expansion) boundaries.
257 *<p>
258 * Basically this value is compared to {@link #mInputTopDepth}, which
259 * indicates what was the depth at the point where the currently active
260 * input scope/block was started.
261 */
262 protected int mCurrDepth;
263
264 protected int mInputTopDepth;
265
266 /**
267 * Number of times a parsed general entity has been expanded; used for
268 * (optionally) limiting number of expansion to guard against
269 * denial-of-service attacks like "Billion Laughs".
270 *
271 * @since 4.3
272 */
273 protected int mEntityExpansionCount;
274
275 /**
276 * Flag that indicates whether linefeeds in the input data are to
277 * be normalized or not.
278 * Xml specs mandate that the line feeds are only normalized
279 * when they are from the external entities (main doc, external
280 * general/parsed entities), so normalization has to be
281 * suppressed when expanding internal general/parsed entities.
282 */
283 protected boolean mNormalizeLFs;
284
285 /**
286 * Flag that indicates whether all escaped chars are accepted in XML 1.0.
287 */
288 protected boolean mXml10AllowAllEscapedChars;
289
290 /*
291 ///////////////////////////////////////////////////////////////////////
292 // Buffer(s) for local name(s) and text content
293 ///////////////////////////////////////////////////////////////////////
294 */
295
296 /**
297 * Temporary buffer used if local name can not be just directly
298 * constructed from input buffer (name is on a boundary or such).
299 */
300 protected char[] mNameBuffer = null;
301
302 /*
303 ///////////////////////////////////////////////////////////////////////
304 // Information about starting location of event
305 // Reader is pointing to; updated on-demand
306 ///////////////////////////////////////////////////////////////////////
307 */
308
309 // // // Location info at point when current token was started
310
311 /**
312 * Total number of characters read before start of current token.
313 * For big (gigabyte-sized) sizes are possible, needs to be long,
314 * unlike pointers and sizes related to in-memory buffers.
315 */
316 protected long mTokenInputTotal = 0;
317
318 /**
319 * Input row on which current token starts, 1-based
320 */
321 protected int mTokenInputRow = 1;
322
323 /**
324 * Column on input row that current token starts; 0-based (although
325 * in the end it'll be converted to 1-based)
326 */
327 protected int mTokenInputCol = 0;
328
329 /*
330 ///////////////////////////////////////////////////////////////////////
331 // XML document information (from doc decl if one was found) common to
332 // all entities (main xml document, external DTD subset)
333 ///////////////////////////////////////////////////////////////////////
334 */
335
336 /**
337 * Input stream encoding, if known (passed in, or determined by
338 * auto-detection); null if not.
339 */
340 protected String mDocInputEncoding = null;
341
342 /**
343 * Character encoding from xml declaration, if any; null if no
344 * declaration, or it didn't specify encoding.
345 */
346 protected String mDocXmlEncoding = null;
347
348 /**
349 * XML version as declared by the document; one of constants
350 * from {@link XmlConsts} (like {@link XmlConsts#XML_V_10}).
351 */
352 protected int mDocXmlVersion = XmlConsts.XML_V_UNKNOWN;
353
354 /**
355 * Cache of internal character entities;
356 */
357 protected Map<String,IntEntity> mCachedEntities;
358
359 /**
360 * Flag for whether or not character references should be treated as entities
361 */
362 protected boolean mCfgTreatCharRefsAsEntities;
363
364 /**
365 * Entity reference stream currently points to.
366 */
367 protected EntityDecl mCurrEntity;
368
369 /*
370 ///////////////////////////////////////////////////////////////////////
371 // Life-cycle
372 ///////////////////////////////////////////////////////////////////////
373 */
374
375 /**
376 * Constructor used when creating a complete new (main-level) reader that
377 * does not share its input buffers or state with another reader.
378 */
379 protected StreamScanner(WstxInputSource input, ReaderConfig cfg,
380 XMLResolver res)
381 {
382 super();
383 mInput = input;
384 // 17-Jun-2004, TSa: Need to know root-level input source
385 mRootInput = input;
386
387 mConfig = cfg;
388 mSymbols = cfg.getSymbols();
389 int cf = cfg.getConfigFlags();
390 mCfgNsEnabled = (cf & CFG_NAMESPACE_AWARE) != 0;
391 mCfgReplaceEntities = (cf & CFG_REPLACE_ENTITY_REFS) != 0;
392
393 // waiting for pull request, see https://github.com/FasterXML/woodstox/pull/56
394 mXml10AllowAllEscapedChars = true;//mConfig.willXml10AllowAllEscapedChars();
395
396 mNormalizeLFs = mConfig.willNormalizeLFs();
397 mInputBuffer = null;
398 mInputPtr = mInputEnd = 0;
399 mEntityResolver = res;
400
401 mCfgTreatCharRefsAsEntities = mConfig.willTreatCharRefsAsEnts();
402 if (mCfgTreatCharRefsAsEntities) {
403 mCachedEntities = new HashMap<String,IntEntity>();
404 } else {
405 mCachedEntities = Collections.emptyMap();
406 }
407 }
408
409 /*
410 ///////////////////////////////////////////////////////////////////////
411 // Package API
412 ///////////////////////////////////////////////////////////////////////
413 */
414
415 /**
416 * Method that returns location of the last character returned by this
417 * reader; that is, location "one less" than the currently pointed to
418 * location.
419 */
420 protected WstxInputLocation getLastCharLocation()
421 {
422 return mInput.getLocation(mCurrInputProcessed + mInputPtr - 1,
423 mCurrInputRow, mInputPtr - mCurrInputRowStart);
424 }
425
426 protected URL getSource() throws IOException {
427 return mInput.getSource();
428 }
429
430 protected String getSystemId() {
431 return mInput.getSystemId();
432 }
433
434 /*
435 ///////////////////////////////////////////////////////////////////////
436 // Partial `LocationInfo` implementation (not implemented
437 // by this base class, but is by some sub-classes)
438 ///////////////////////////////////////////////////////////////////////
439 */
440
441 /**
442 * Returns location of last properly parsed token; as per StAX specs,
443 * apparently needs to be the end of current event, which is the same
444 * as the start of the following event (or EOF if that's next).
445 */
446 @Override
447 public abstract Location getLocation();
448
449 public XMLStreamLocation2 getStartLocation()
450 {
451 // note: +1 is used as columns are 1-based...
452 return mInput.getLocation(mTokenInputTotal,
453 mTokenInputRow, mTokenInputCol + 1);
454 }
455
456 public XMLStreamLocation2 getCurrentLocation()
457 {
458 return mInput.getLocation(mCurrInputProcessed + mInputPtr,
459 mCurrInputRow, mInputPtr - mCurrInputRowStart + 1);
460 }
461
462 /*
463 ///////////////////////////////////////////////////////////////////////
464 // InputProblemReporter implementation
465 ///////////////////////////////////////////////////////////////////////
466 */
467
468 public WstxException throwWfcException(String msg, boolean deferErrors)
469 throws WstxException
470 {
471 WstxException ex = constructWfcException(msg);
472 if (!deferErrors) {
473 throw ex;
474 }
475 return ex;
476 }
477
478 @Override
479 public void throwParseError(String msg) throws XMLStreamException {
480 throwParseError(msg, null, null);
481 }
482
483 /**
484 * Throws generic parse error with specified message and current parsing
485 * location.
486 *<p>
487 * Note: public access only because core code in other packages needs
488 * to access it.
489 */
490 @Override
491 public void throwParseError(String format, Object arg, Object arg2)
492 throws XMLStreamException
493 {
494 String msg = (arg != null || arg2 != null) ?
495 MessageFormat.format(format, new Object[] { arg, arg2 }) : format;
496 throw constructWfcException(msg);
497 }
498
499 public void reportProblem(String probType, String format, Object arg, Object arg2)
500 throws XMLStreamException
501 {
502 XMLReporter rep = mConfig.getXMLReporter();
503 if (rep != null) {
504 _reportProblem(rep, probType,
505 MessageFormat.format(format, new Object[] { arg, arg2 }), null);
506 }
507 }
508
509 @Override
510 public void reportProblem(Location loc, String probType,
511 String format, Object arg, Object arg2)
512 throws XMLStreamException
513 {
514 XMLReporter rep = mConfig.getXMLReporter();
515 if (rep != null) {
516 String msg = (arg != null || arg2 != null) ?
517 MessageFormat.format(format, new Object[] { arg, arg2 }) : format;
518 _reportProblem(rep, probType, msg, loc);
519 }
520 }
521
522 protected void _reportProblem(XMLReporter rep, String probType, String msg, Location loc)
523 throws XMLStreamException
524 {
525 if (loc == null) {
526 loc = getLastCharLocation();
527 }
528 _reportProblem(rep, new XMLValidationProblem(loc, msg, XMLValidationProblem.SEVERITY_ERROR, probType));
529 }
530
531 protected void _reportProblem(XMLReporter rep, XMLValidationProblem prob)
532 throws XMLStreamException
533 {
534 if (rep != null) {
535 Location loc = prob.getLocation();
536 if (loc == null) {
537 loc = getLastCharLocation();
538 prob.setLocation(loc);
539 }
540 // Backwards-compatibility fix: add non-null type, if missing:
541 if (prob.getType() == null) {
542 prob.setType(ErrorConsts.WT_VALIDATION);
543 }
544 // [WSTX-154]: was catching and dropping thrown exception: shouldn't.
545 // [WTSX-157]: need to support XMLReporter2
546 if (rep instanceof XMLReporter2) {
547 ((XMLReporter2) rep).report(prob);
548 } else {
549 rep.report(prob.getMessage(), prob.getType(), prob, loc);
550 }
551 }
552 }
553
554 /**
555 *<p>
556 * Note: this is the base implementation used for implementing
557 * <code>ValidationContext</code>
558 */
559 @Override
560 public void reportValidationProblem(XMLValidationProblem prob)
561 throws XMLStreamException
562 {
563 // !!! TBI: Fail-fast vs. deferred modes?
564 /* For now let's implement basic functionality: warnings get
565 * reported via XMLReporter, errors and fatal errors result in
566 * immediate exceptions.
567 */
568 /* 27-May-2008, TSa: [WSTX-153] Above is incorrect: as per Stax
569 * javadocs for XMLReporter, both warnings and non-fatal errors
570 * (which includes all validation errors) should be reported via
571 * XMLReporter interface, and only fatals should cause an
572 * immediate stream exception (by-passing reporter)
573 */
574 if (prob.getSeverity() > XMLValidationProblem.SEVERITY_ERROR) {
575 throw WstxValidationException.create(prob);
576 }
577 XMLReporter rep = mConfig.getXMLReporter();
578 if (rep != null) {
579 _reportProblem(rep, prob);
580 } else {
581 /* If no reporter, regular non-fatal errors are to be reported
582 * as exceptions as well, for backwards compatibility
583 */
584 if (prob.getSeverity() >= XMLValidationProblem.SEVERITY_ERROR) {
585 throw WstxValidationException.create(prob);
586 }
587 }
588 }
589
590 public void reportValidationProblem(String msg, int severity)
591 throws XMLStreamException
592 {
593 reportValidationProblem(new XMLValidationProblem(getLastCharLocation(),
594 msg, severity));
595 }
596
597 @Override
598 public void reportValidationProblem(String msg)
599 throws XMLStreamException
600 {
601 reportValidationProblem(new XMLValidationProblem(getLastCharLocation(), msg,
602 XMLValidationProblem.SEVERITY_ERROR));
603 }
604
605 public void reportValidationProblem(Location loc, String msg)
606 throws XMLStreamException
607 {
608 reportValidationProblem(new XMLValidationProblem(loc, msg));
609 }
610
611 @Override
612 public void reportValidationProblem(String format, Object arg, Object arg2)
613 throws XMLStreamException
614 {
615 reportValidationProblem(MessageFormat.format(format, new Object[] { arg, arg2 }));
616 }
617
618 /*
619 ///////////////////////////////////////////////////////////////////////
620 // Other error reporting methods
621 ///////////////////////////////////////////////////////////////////////
622 */
623
624 protected WstxException constructWfcException(String msg)
625 {
626 return new WstxParsingException(msg, getLastCharLocation());
627 }
628
629 /**
630 * Construct and return a {@link XMLStreamException} to throw
631 * as a result of a failed Typed Access operation (but one not
632 * caused by a Well-Formedness Constraint or Validation Constraint
633 * problem)
634 */
635 /*
636 protected WstxException _constructTypeException(String msg)
637 {
638 // Hmmh. Should there be a distinct sub-type?
639 return new WstxParsingException(msg, getLastCharLocation());
640 }
641 */
642
643 protected WstxException constructFromIOE(IOException ioe)
644 {
645 return new WstxIOException(ioe);
646 }
647
648 protected WstxException constructNullCharException()
649 {
650 return new WstxUnexpectedCharException("Illegal character (NULL, unicode 0) encountered: not valid in any content",
651 getLastCharLocation(), CHAR_NULL);
652 }
653
654 protected void throwUnexpectedChar(int i, String msg) throws WstxException
655 {
656 char c = (char) i;
657 String excMsg = "Unexpected character "+getCharDesc(c)+msg;
658 throw new WstxUnexpectedCharException(excMsg, getLastCharLocation(), c);
659 }
660
661 protected void throwNullChar() throws WstxException {
662 throw constructNullCharException();
663 }
664
665 protected void throwInvalidSpace(int i) throws WstxException {
666 throwInvalidSpace(i, false);
667 }
668
669 protected WstxException throwInvalidSpace(int i, boolean deferErrors)
670 throws WstxException
671 {
672 char c = (char) i;
673 WstxException ex;
674 if (c == CHAR_NULL) {
675 ex = constructNullCharException();
676 } else {
677 String msg = "Illegal character ("+getCharDesc(c)+")";
678 if (mXml11) {
679 msg += " [note: in XML 1.1, it could be included via entity expansion]";
680 }
681 ex = new WstxUnexpectedCharException(msg, getLastCharLocation(), c);
682 }
683 if (!deferErrors) {
684 throw ex;
685 }
686 return ex;
687 }
688
689 protected void throwUnexpectedEOF(String msg)
690 throws WstxException
691 {
692 throw new WstxEOFException("Unexpected EOF"+(msg == null ? "" : msg),
693 getLastCharLocation());
694 }
695
696 /**
697 * Similar to {@link #throwUnexpectedEOF}, but only indicates ending
698 * of an input block. Used when reading a token that can not span
699 * input block boundaries (ie. can not continue past end of an
700 * entity expansion).
701 */
702 protected void throwUnexpectedEOB(String msg)
703 throws WstxException
704 {
705 throw new WstxEOFException("Unexpected end of input block"+(msg == null ? "" : msg),
706 getLastCharLocation());
707 }
708
709 protected void throwFromIOE(IOException ioe) throws WstxException {
710 throw new WstxIOException(ioe);
711 }
712
713 protected void throwFromStrE(XMLStreamException strex)
714 throws WstxException
715 {
716 if (strex instanceof WstxException) {
717 throw (WstxException) strex;
718 }
719 throw new WstxException(strex);
720 }
721
722 /**
723 * Method called to report an error, when caller's signature only
724 * allows runtime exceptions to be thrown.
725 */
726 protected void throwLazyError(Exception e)
727 {
728 if (e instanceof XMLStreamException) {
729 WstxLazyException.throwLazily((XMLStreamException) e);
730 }
731 ExceptionUtil.throwRuntimeException(e);
732 }
733
734 protected String tokenTypeDesc(int type) {
735 return ErrorConsts.tokenTypeDesc(type);
736 }
737
738 /*
739 ///////////////////////////////////////////////////////////////////////
740 // Input buffer handling
741 ///////////////////////////////////////////////////////////////////////
742 */
743
744 /**
745 * Returns current input source this source uses.
746 *<p>
747 * Note: public only because some implementations are on different
748 * package.
749 */
750 public final WstxInputSource getCurrentInput() {
751 return mInput;
752 }
753
754 protected final int inputInBuffer() {
755 return mInputEnd - mInputPtr;
756 }
757
758 @SuppressWarnings("cast")
759 protected final int getNext() throws XMLStreamException
760 {
761 if (mInputPtr >= mInputEnd) {
762 if (!loadMore()) {
763 return -1;
764 }
765 }
766 return (int) mInputBuffer[mInputPtr++];
767 }
768
769 /**
770 * Similar to {@link #getNext}, but does not advance pointer
771 * in input buffer.
772 *<p>
773 * Note: this method only peeks within current input source;
774 * it does not close it and check nested input source (if any).
775 * This is necessary when checking keywords, since they can never
776 * cross input block boundary.
777 */
778 @SuppressWarnings("cast")
779 protected final int peekNext()
780 throws XMLStreamException
781 {
782 if (mInputPtr >= mInputEnd) {
783 if (!loadMoreFromCurrent()) {
784 return -1;
785 }
786 }
787 return (int) mInputBuffer[mInputPtr];
788 }
789
790 protected final char getNextChar(String errorMsg)
791 throws XMLStreamException
792 {
793 if (mInputPtr >= mInputEnd) {
794 loadMore(errorMsg);
795 }
796 return mInputBuffer[mInputPtr++];
797 }
798
799 /**
800 * Similar to {@link #getNextChar}, but will not read more characters
801 * from parent input source(s) if the current input source doesn't
802 * have more content. This is often needed to prevent "runaway" content,
803 * such as comments that start in an entity but do not have matching
804 * close marker inside entity; XML specification specifically states
805 * such markup is not legal.
806 */
807 protected final char getNextCharFromCurrent(String errorMsg)
808 throws XMLStreamException
809 {
810 if (mInputPtr >= mInputEnd) {
811 loadMoreFromCurrent(errorMsg);
812 }
813 return mInputBuffer[mInputPtr++];
814 }
815
816 /**
817 * Method that will skip through zero or more white space characters,
818 * and return either the character following white space, or -1 to
819 * indicate EOF (end of the outermost input source)/
820 */
821 @SuppressWarnings("cast")
822 protected final int getNextAfterWS()
823 throws XMLStreamException
824 {
825 if (mInputPtr >= mInputEnd) {
826 if (!loadMore()) {
827 return -1;
828 }
829 }
830 char c = mInputBuffer[mInputPtr++];
831 while (c <= CHAR_SPACE) {
832 // Linefeed?
833 if (c == '\n' || c == '\r') {
834 skipCRLF(c);
835 } else if (c != CHAR_SPACE && c != '\t') {
836 throwInvalidSpace(c);
837 }
838 // Still a white space?
839 if (mInputPtr >= mInputEnd) {
840 if (!loadMore()) {
841 return -1;
842 }
843 }
844 c = mInputBuffer[mInputPtr++];
845 }
846 return (int) c;
847 }
848
849 protected final char getNextCharAfterWS(String errorMsg)
850 throws XMLStreamException
851 {
852 if (mInputPtr >= mInputEnd) {
853 loadMore(errorMsg);
854 }
855
856 char c = mInputBuffer[mInputPtr++];
857 while (c <= CHAR_SPACE) {
858 // Linefeed?
859 if (c == '\n' || c == '\r') {
860 skipCRLF(c);
861 } else if (c != CHAR_SPACE && c != '\t') {
862 throwInvalidSpace(c);
863 }
864
865 // Still a white space?
866 if (mInputPtr >= mInputEnd) {
867 loadMore(errorMsg);
868 }
869 c = mInputBuffer[mInputPtr++];
870 }
871 return c;
872 }
873
874 protected final char getNextInCurrAfterWS(String errorMsg)
875 throws XMLStreamException
876 {
877 return getNextInCurrAfterWS(errorMsg, getNextCharFromCurrent(errorMsg));
878 }
879
880 protected final char getNextInCurrAfterWS(String errorMsg, char c)
881 throws XMLStreamException
882 {
883 while (c <= CHAR_SPACE) {
884 // Linefeed?
885 if (c == '\n' || c == '\r') {
886 skipCRLF(c);
887 } else if (c != CHAR_SPACE && c != '\t') {
888 throwInvalidSpace(c);
889 }
890
891 // Still a white space?
892 if (mInputPtr >= mInputEnd) {
893 loadMoreFromCurrent(errorMsg);
894 }
895 c = mInputBuffer[mInputPtr++];
896 }
897 return c;
898 }
899
900 /**
901 * Method called when a CR has been spotted in input; checks if next
902 * char is LF, and if so, skips it. Note that next character has to
903 * come from the current input source, to qualify; it can never come
904 * from another (nested) input source.
905 *
906 * @return True, if passed in char is '\r' and next one is '\n'.
907 */
908 protected final boolean skipCRLF(char c)
909 throws XMLStreamException
910 {
911 boolean result;
912
913 if (c == '\r' && peekNext() == '\n') {
914 ++mInputPtr;
915 result = true;
916 } else {
917 result = false;
918 }
919 ++mCurrInputRow;
920 mCurrInputRowStart = mInputPtr;
921 return result;
922 }
923
924 protected final void markLF() {
925 ++mCurrInputRow;
926 mCurrInputRowStart = mInputPtr;
927 }
928
929 protected final void markLF(int inputPtr) {
930 ++mCurrInputRow;
931 mCurrInputRowStart = inputPtr;
932 }
933
934 /**
935 * Method to push back last character read; can only be called once,
936 * that is, no more than one char can be guaranteed to be succesfully
937 * returned.
938 */
939 protected final void pushback() { --mInputPtr; }
940
941 /*
942 ///////////////////////////////////////////////////////////////////////
943 // Sub-class overridable input handling methods
944 ///////////////////////////////////////////////////////////////////////
945 */
946
947 /**
948 * Method called when an entity has been expanded (new input source
949 * has been created). Needs to initialize location information and change
950 * active input source.
951 *
952 * @param entityId Name of the entity being expanded
953 */
954 protected void initInputSource(WstxInputSource newInput, boolean isExt,
955 String entityId)
956 throws XMLStreamException
957 {
958 // Let's make sure new input will be read next time input is needed:
959 mInputPtr = 0;
960 mInputEnd = 0;
961 /* Plus, reset the input location so that'll be accurate for
962 * error reporting etc.
963 */
964 mInputTopDepth = mCurrDepth;
965
966 // [WSTX-296]: Check for entity expansion depth against configurable limit
967 int entityDepth = mInput.getEntityDepth() + 1;
968 verifyLimit("Maximum entity expansion depth", mConfig.getMaxEntityDepth(), entityDepth);
969 mInput = newInput;
970 mInput.initInputLocation(this, mCurrDepth, entityDepth);
971
972 /* 21-Feb-2006, TSa: Linefeeds are NOT normalized when expanding
973 * internal entities (XML, 2.11)
974 */
975 if (isExt) {
976 mNormalizeLFs = true;
977 } else {
978 mNormalizeLFs = false;
979 }
980 }
981
982 /**
983 * Method that will try to read one or more characters from currently
984 * open input sources; closing input sources if necessary.
985 *
986 * @return true if reading succeeded (or may succeed), false if
987 * we reached EOF.
988 */
989 protected boolean loadMore()
990 throws XMLStreamException
991 {
992 WstxInputSource input = mInput;
993 do {
994 /* Need to make sure offsets are properly updated for error
995 * reporting purposes, and do this now while previous amounts
996 * are still known.
997 */
998 mCurrInputProcessed += mInputEnd;
999 verifyLimit("Maximum document characters", mConfig.getMaxCharacters(), mCurrInputProcessed);
1000 mCurrInputRowStart -= mInputEnd;
1001 int count;
1002 try {
1003 count = input.readInto(this);
1004 if (count > 0) {
1005 return true;
1006 }
1007 input.close();
1008 } catch (IOException ioe) {
1009 throw constructFromIOE(ioe);
1010 }
1011 if (input == mRootInput) {
1012 /* Note: no need to check entity/input nesting in this
1013 * particular case, since it will be handled by higher level
1014 * parsing code (results in an unexpected EOF)
1015 */
1016 return false;
1017 }
1018 WstxInputSource parent = input.getParent();
1019 if (parent == null) { // sanity check!
1020 throwNullParent(input);
1021 }
1022 /* 13-Feb-2006, TSa: Ok, do we violate a proper nesting constraints
1023 * with this input block closure?
1024 */
1025 if (mCurrDepth != input.getScopeId()) {
1026 handleIncompleteEntityProblem(input);
1027 }
1028
1029 mInput = input = parent;
1030 input.restoreContext(this);
1031 mInputTopDepth = input.getScopeId();
1032 /* 21-Feb-2006, TSa: Since linefeed normalization needs to be
1033 * suppressed for internal entity expansion, we may need to
1034 * change the state...
1035 */
1036 if (!mNormalizeLFs) {
1037 mNormalizeLFs = !input.fromInternalEntity();
1038 }
1039 // Maybe there are leftovers from that input in buffer now?
1040 } while (mInputPtr >= mInputEnd);
1041
1042 return true;
1043 }
1044
1045 protected final boolean loadMore(String errorMsg)
1046 throws XMLStreamException
1047 {
1048 if (!loadMore()) {
1049 throwUnexpectedEOF(errorMsg);
1050 }
1051 return true;
1052 }
1053
1054 protected boolean loadMoreFromCurrent()
1055 throws XMLStreamException
1056 {
1057 // Need to update offsets properly
1058 mCurrInputProcessed += mInputEnd;
1059 mCurrInputRowStart -= mInputEnd;
1060 verifyLimit("Maximum document characters", mConfig.getMaxCharacters(), mCurrInputProcessed);
1061 try {
1062 int count = mInput.readInto(this);
1063 return (count > 0);
1064 } catch (IOException ie) {
1065 throw constructFromIOE(ie);
1066 }
1067 }
1068
1069 protected final boolean loadMoreFromCurrent(String errorMsg)
1070 throws XMLStreamException
1071 {
1072 if (!loadMoreFromCurrent()) {
1073 throwUnexpectedEOB(errorMsg);
1074 }
1075 return true;
1076 }
1077
1078 /**
1079 * Method called to make sure current main-level input buffer has at
1080 * least specified number of characters available consequtively,
1081 * without having to call {@link #loadMore}. It can only be called
1082 * when input comes from main-level buffer; further, call can shift
1083 * content in input buffer, so caller has to flush any data still
1084 * pending. In short, caller has to know exactly what it's doing. :-)
1085 *<p>
1086 * Note: method does not check for any other input sources than the
1087 * current one -- if current source can not fulfill the request, a
1088 * failure is indicated.
1089 *
1090 * @return true if there's now enough data; false if not (EOF)
1091 */
1092 protected boolean ensureInput(int minAmount)
1093 throws XMLStreamException
1094 {
1095 int currAmount = mInputEnd - mInputPtr;
1096 if (currAmount >= minAmount) {
1097 return true;
1098 }
1099 try {
1100 return mInput.readMore(this, minAmount);
1101 } catch (IOException ie) {
1102 throw constructFromIOE(ie);
1103 }
1104 }
1105
1106 protected void closeAllInput(boolean force)
1107 throws XMLStreamException
1108 {
1109 WstxInputSource input = mInput;
1110 while (true) {
1111 try {
1112 if (force) {
1113 input.closeCompletely();
1114 } else {
1115 input.close();
1116 }
1117 } catch (IOException ie) {
1118 throw constructFromIOE(ie);
1119 }
1120 if (input == mRootInput) {
1121 break;
1122 }
1123 WstxInputSource parent = input.getParent();
1124 if (parent == null) { // sanity check!
1125 throwNullParent(input);
1126 }
1127 mInput = input = parent;
1128 }
1129 }
1130
1131 /**
1132 * @param curr Input source currently in use
1133 */
1134 protected void throwNullParent(WstxInputSource curr)
1135 {
1136 throw new IllegalStateException(ErrorConsts.ERR_INTERNAL);
1137 //throw new IllegalStateException("Internal error: null parent for input source '"+curr+"'; should never occur (should have stopped at root input '"+mRootInput+"').");
1138 }
1139
1140 /*
1141 ///////////////////////////////////////////////////////////////////////
1142 // Entity resolution
1143 ///////////////////////////////////////////////////////////////////////
1144 */
1145
1146 /**
1147 * Method that tries to resolve a character entity, or (if caller so
1148 * specifies), a pre-defined internal entity (lt, gt, amp, apos, quot).
1149 * It will succeed iff:
1150 * <ol>
1151 * <li>Entity in question is a simple character entity (either one of
1152 * 5 pre-defined ones, or using decimal/hex notation), AND
1153 * <li>
1154 * <li>Entity fits completely inside current input buffer.
1155 * <li>
1156 * </ol>
1157 * If so, character value of entity is returned. Character 0 is returned
1158 * otherwise; if so, caller needs to do full resolution.
1159 *<p>
1160 * Note: On entry we are guaranteed there are at least 3 more characters
1161 * in this buffer; otherwise we shouldn't be called.
1162 *
1163 * @param checkStd If true, will check pre-defined internal entities
1164 * (gt, lt, amp, apos, quot); if false, will only check actual
1165 * character entities.
1166 *
1167 * @return (Valid) character value, if entity is a character reference,
1168 * and could be resolved from current input buffer (does not span
1169 * buffer boundary); null char (code 0) if not (either non-char
1170 * entity, or spans input buffer boundary).
1171 */
1172 protected int resolveSimpleEntity(boolean checkStd)
1173 throws XMLStreamException
1174 {
1175 char[] buf = mInputBuffer;
1176 int ptr = mInputPtr;
1177 char c = buf[ptr++];
1178
1179 // Numeric reference?
1180 if (c == '#') {
1181 c = buf[ptr++];
1182 int value = 0;
1183 int inputLen = mInputEnd;
1184 if (c == 'x') { // hex
1185 while (ptr < inputLen) {
1186 c = buf[ptr++];
1187 if (c == ';') {
1188 break;
1189 }
1190 value = value << 4;
1191 if (c <= '9' && c >= '0') {
1192 value += (c - '0');
1193 } else if (c >= 'a' && c <= 'f') {
1194 value += (10 + (c - 'a'));
1195 } else if (c >= 'A' && c <= 'F') {
1196 value += (10 + (c - 'A'));
1197 } else {
1198 mInputPtr = ptr; // so error points to correct char
1199 throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
1200 }
1201 /* Need to check for overflow; easiest to do right as
1202 * it happens...
1203 */
1204 if (value > MAX_UNICODE_CHAR) {
1205 reportUnicodeOverflow();
1206 }
1207 }
1208 } else { // numeric (decimal)
1209 while (c != ';') {
1210 if (c <= '9' && c >= '0') {
1211 value = (value * 10) + (c - '0');
1212 // Overflow?
1213 if (value > MAX_UNICODE_CHAR) {
1214 reportUnicodeOverflow();
1215 }
1216 } else {
1217 mInputPtr = ptr; // so error points to correct char
1218 throwUnexpectedChar(c, "; expected a decimal number.");
1219 }
1220 if (ptr >= inputLen) {
1221 break;
1222 }
1223 c = buf[ptr++];
1224 }
1225 }
1226 /* We get here either if we got it all, OR if we ran out of
1227 * input in current buffer.
1228 */
1229 if (c == ';') { // got the full thing
1230 mInputPtr = ptr;
1231 validateChar(value);
1232 return value;
1233 }
1234
1235 /* If we ran out of input, need to just fall back, gets
1236 * resolved via 'full' resolution mechanism.
1237 */
1238 } else if (checkStd) {
1239 /* Caller may not want to resolve these quite yet...
1240 * (when it wants separate events for non-char entities)
1241 */
1242 if (c == 'a') { // amp or apos?
1243 c = buf[ptr++];
1244
1245 if (c == 'm') { // amp?
1246 if (buf[ptr++] == 'p') {
1247 if (ptr < mInputEnd && buf[ptr++] == ';') {
1248 mInputPtr = ptr;
1249 return '&';
1250 }
1251 }
1252 } else if (c == 'p') { // apos?
1253 if (buf[ptr++] == 'o') {
1254 int len = mInputEnd;
1255 if (ptr < len && buf[ptr++] == 's') {
1256 if (ptr < len && buf[ptr++] == ';') {
1257 mInputPtr = ptr;
1258 return '\'';
1259 }
1260 }
1261 }
1262 }
1263 } else if (c == 'g') { // gt?
1264 if (buf[ptr++] == 't' && buf[ptr++] == ';') {
1265 mInputPtr = ptr;
1266 return '>';
1267 }
1268 } else if (c == 'l') { // lt?
1269 if (buf[ptr++] == 't' && buf[ptr++] == ';') {
1270 mInputPtr = ptr;
1271 return '<';
1272 }
1273 } else if (c == 'q') { // quot?
1274 if (buf[ptr++] == 'u' && buf[ptr++] == 'o') {
1275 int len = mInputEnd;
1276 if (ptr < len && buf[ptr++] == 't') {
1277 if (ptr < len && buf[ptr++] == ';') {
1278 mInputPtr = ptr;
1279 return '"';
1280 }
1281 }
1282 }
1283 }
1284 }
1285 return 0;
1286 }
1287
1288 /**
1289 * Method called to resolve character entities, and only character
1290 * entities (except that pre-defined char entities -- amp, apos, lt,
1291 * gt, quote -- MAY be "char entities" in this sense, depending on
1292 * arguments).
1293 * Otherwise it is to return the null char; if so,
1294 * the input pointer will point to the same point as when method
1295 * entered (char after ampersand), plus the ampersand itself is
1296 * guaranteed to be in the input buffer (so caller can just push it
1297 * back if necessary).
1298 *<p>
1299 * Most often this method is called when reader is not to expand
1300 * non-char entities automatically, but to return them as separate
1301 * events.
1302 *<p>
1303 * Main complication here is that we need to do 5-char lookahead. This
1304 * is problematic if chars are on input buffer boundary. This is ok
1305 * for the root level input buffer, but not for some nested buffers.
1306 * However, according to XML specs, such split entities are actually
1307 * illegal... so we can throw an exception in those cases.
1308 *
1309 * @param checkStd If true, will check pre-defined internal entities
1310 * (gt, lt, amp, apos, quot) as character entities; if false, will only
1311 * check actual 'real' character entities.
1312 *
1313 * @return (Valid) character value, if entity is a character reference,
1314 * and could be resolved from current input buffer (does not span
1315 * buffer boundary); null char (code 0) if not (either non-char
1316 * entity, or spans input buffer boundary).
1317 */
1318 protected int resolveCharOnlyEntity(boolean checkStd)
1319 throws XMLStreamException
1320 {
1321 //int avail = inputInBuffer();
1322 int avail = mInputEnd - mInputPtr;
1323 if (avail < 6) {
1324 // split entity, or buffer boundary
1325 /* Don't want to lose leading '&' (in case we can not expand
1326 * the entity), so let's push it back first
1327 */
1328 --mInputPtr;
1329 /* Shortest valid reference would be 3 chars ('&a;'); which
1330 * would only be legal from an expanded entity...
1331 */
1332 if (!ensureInput(6)) {
1333 avail = inputInBuffer();
1334 if (avail < 3) {
1335 throwUnexpectedEOF(SUFFIX_IN_ENTITY_REF);
1336 }
1337 } else {
1338 avail = 6;
1339 }
1340 // ... and now we can move pointer back as well:
1341 ++mInputPtr;
1342 }
1343
1344 /* Ok, now we have one more character to check, and that's enough
1345 * to determine type decisively.
1346 */
1347 char c = mInputBuffer[mInputPtr];
1348
1349 // A char reference?
1350 if (c == '#') { // yup
1351 ++mInputPtr;
1352 return resolveCharEnt(null);
1353 }
1354
1355 // nope... except may be a pre-def?
1356 if (checkStd) {
1357 if (c == 'a') {
1358 char d = mInputBuffer[mInputPtr+1];
1359 if (d == 'm') {
1360 if (avail >= 4
1361 && mInputBuffer[mInputPtr+2] == 'p'
1362 && mInputBuffer[mInputPtr+3] == ';') {
1363 mInputPtr += 4;
1364 return '&';
1365 }
1366 } else if (d == 'p') {
1367 if (avail >= 5
1368 && mInputBuffer[mInputPtr+2] == 'o'
1369 && mInputBuffer[mInputPtr+3] == 's'
1370 && mInputBuffer[mInputPtr+4] == ';') {
1371 mInputPtr += 5;
1372 return '\'';
1373 }
1374 }
1375 } else if (c == 'l') {
1376 if (avail >= 3
1377 && mInputBuffer[mInputPtr+1] == 't'
1378 && mInputBuffer[mInputPtr+2] == ';') {
1379 mInputPtr += 3;
1380 return '<';
1381 }
1382 } else if (c == 'g') {
1383 if (avail >= 3
1384 && mInputBuffer[mInputPtr+1] == 't'
1385 && mInputBuffer[mInputPtr+2] == ';') {
1386 mInputPtr += 3;
1387 return '>';
1388 }
1389 } else if (c == 'q') {
1390 if (avail >= 5
1391 && mInputBuffer[mInputPtr+1] == 'u'
1392 && mInputBuffer[mInputPtr+2] == 'o'
1393 && mInputBuffer[mInputPtr+3] == 't'
1394 && mInputBuffer[mInputPtr+4] == ';') {
1395 mInputPtr += 5;
1396 return '"';
1397 }
1398 }
1399 }
1400 return 0;
1401 }
1402
1403 /**
1404 * Reverse of {@link #resolveCharOnlyEntity}; will only resolve entity
1405 * if it is NOT a character entity (or pre-defined 'generic' entity;
1406 * amp, apos, lt, gt or quot). Only used in cases where entities
1407 * are to be separately returned unexpanded (in non-entity-replacing
1408 * mode); which means it's never called from dtd handler.
1409 */
1410 protected EntityDecl resolveNonCharEntity()
1411 throws XMLStreamException
1412 {
1413 //int avail = inputInBuffer();
1414 int avail = mInputEnd - mInputPtr;
1415 if (avail < 6) {
1416 // split entity, or buffer boundary
1417 /* Don't want to lose leading '&' (in case we can not expand
1418 * the entity), so let's push it back first
1419 */
1420 --mInputPtr;
1421
1422 /* Shortest valid reference would be 3 chars ('&a;'); which
1423 * would only be legal from an expanded entity...
1424 */
1425 if (!ensureInput(6)) {
1426 avail = inputInBuffer();
1427 if (avail < 3) {
1428 throwUnexpectedEOF(SUFFIX_IN_ENTITY_REF);
1429 }
1430 } else {
1431 avail = 6;
1432 }
1433 // ... and now we can move pointer back as well:
1434 ++mInputPtr;
1435 }
1436
1437 // We don't care about char entities:
1438 char c = mInputBuffer[mInputPtr];
1439 if (c == '#') {
1440 return null;
1441 }
1442
1443 /* 19-Aug-2004, TSa: Need special handling for pre-defined
1444 * entities; they are not counted as 'real' general parsed
1445 * entities, but more as character entities...
1446 */
1447
1448 // have chars at least up to mInputPtr+4 by now
1449 if (c == 'a') {
1450 char d = mInputBuffer[mInputPtr+1];
1451 if (d == 'm') {
1452 if (avail >= 4
1453 && mInputBuffer[mInputPtr+2] == 'p'
1454 && mInputBuffer[mInputPtr+3] == ';') {
1455 // If not automatically expanding:
1456 //return sEntityAmp;
1457 // mInputPtr += 4;
1458 return null;
1459 }
1460 } else if (d == 'p') {
1461 if (avail >= 5
1462 && mInputBuffer[mInputPtr+2] == 'o'
1463 && mInputBuffer[mInputPtr+3] == 's'
1464 && mInputBuffer[mInputPtr+4] == ';') {
1465 return null;
1466 }
1467 }
1468 } else if (c == 'l') {
1469 if (avail >= 3
1470 && mInputBuffer[mInputPtr+1] == 't'
1471 && mInputBuffer[mInputPtr+2] == ';') {
1472 return null;
1473 }
1474 } else if (c == 'g') {
1475 if (avail >= 3
1476 && mInputBuffer[mInputPtr+1] == 't'
1477 && mInputBuffer[mInputPtr+2] == ';') {
1478 return null;
1479 }
1480 } else if (c == 'q') {
1481 if (avail >= 5
1482 && mInputBuffer[mInputPtr+1] == 'u'
1483 && mInputBuffer[mInputPtr+2] == 'o'
1484 && mInputBuffer[mInputPtr+3] == 't'
1485 && mInputBuffer[mInputPtr+4] == ';') {
1486 return null;
1487 }
1488 }
1489
1490 // Otherwise, let's just parse in generic way:
1491 ++mInputPtr; // since we already read the first letter
1492 String id = parseEntityName(c);
1493 mCurrName = id;
1494
1495 return findEntity(id, null);
1496 }
1497
1498 /**
1499 * Method that does full resolution of an entity reference, be it
1500 * character entity, internal entity or external entity, including
1501 * updating of input buffers, and depending on whether result is
1502 * a character entity (or one of 5 pre-defined entities), returns
1503 * char in question, or null character (code 0) to indicate it had
1504 * to change input source.
1505 *
1506 * @param allowExt If true, is allowed to expand external entities
1507 * (expanding text); if false, is not (expanding attribute value).
1508 *
1509 * @return Either single-character replacement (which is NOT to be
1510 * reparsed), or null char (0) to indicate expansion is done via
1511 * input source.
1512 */
1513 protected int fullyResolveEntity(boolean allowExt)
1514 throws XMLStreamException
1515 {
1516 char c = getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF);
1517 // Do we have a (numeric) character entity reference?
1518 if (c == '#') { // numeric
1519 final StringBuffer originalSurface = new StringBuffer("#");
1520 int ch = resolveCharEnt(originalSurface);
1521 if (mCfgTreatCharRefsAsEntities) {
1522 final char[] originalChars = new char[originalSurface.length()];
1523 originalSurface.getChars(0, originalSurface.length(), originalChars, 0);
1524 mCurrEntity = getIntEntity(ch, originalChars);
1525 return 0;
1526 }
1527 return ch;
1528 }
1529
1530 String id = parseEntityName(c);
1531
1532 // Perhaps we have a pre-defined char reference?
1533 c = id.charAt(0);
1534 /*
1535 * 16-May-2004, TSa: Should custom entities (or ones defined in int/ext subset) override
1536 * pre-defined settings for these?
1537 */
1538 char d = CHAR_NULL;
1539 if (c == 'a') { // amp or apos?
1540 if (id.equals("amp")) {
1541 d = '&';
1542 } else if (id.equals("apos")) {
1543 d = '\'';
1544 }
1545 } else if (c == 'g') { // gt?
1546 if (id.length() == 2 && id.charAt(1) == 't') {
1547 d = '>';
1548 }
1549 } else if (c == 'l') { // lt?
1550 if (id.length() == 2 && id.charAt(1) == 't') {
1551 d = '<';
1552 }
1553 } else if (c == 'q') { // quot?
1554 if (id.equals("quot")) {
1555 d = '"';
1556 }
1557 }
1558
1559 if (d != CHAR_NULL) {
1560 if (mCfgTreatCharRefsAsEntities) {
1561 final char[] originalChars = new char[id.length()];
1562 id.getChars(0, id.length(), originalChars, 0);
1563 mCurrEntity = getIntEntity(d, originalChars);
1564 return 0;
1565 }
1566 return d;
1567 }
1568
1569 final EntityDecl e = expandEntity(id, allowExt, null);
1570 if (mCfgTreatCharRefsAsEntities) {
1571 mCurrEntity = e;
1572 }
1573 return 0;
1574 }
1575
1576 /**
1577 * Returns an entity (possibly from cache) for the argument character using the encoded
1578 * representation in mInputBuffer[entityStartPos ... mInputPtr-1].
1579 */
1580 protected EntityDecl getIntEntity(int ch, final char[] originalChars)
1581 {
1582 String cacheKey = new String(originalChars);
1583
1584 IntEntity entity = mCachedEntities.get(cacheKey);
1585 if (entity == null) {
1586 String repl;
1587 if (ch <= 0xFFFF) {
1588 repl = Character.toString((char) ch);
1589 } else {
1590 StringBuffer sb = new StringBuffer(2);
1591 ch -= 0x10000;
1592 sb.append((char) ((ch >> 10) + 0xD800));
1593 sb.append((char) ((ch & 0x3FF) + 0xDC00));
1594 repl = sb.toString();
1595 }
1596 entity = IntEntity.create(new String(originalChars), repl);
1597 mCachedEntities.put(cacheKey, entity);
1598 }
1599 return entity;
1600 }
1601
1602
1603 /**
1604 * Helper method that will try to expand a parsed entity (parameter or
1605 * generic entity).
1606 *<p>
1607 * note: called by sub-classes (dtd parser), needs to be protected.
1608 *
1609 * @param id Name of the entity being expanded
1610 * @param allowExt Whether external entities can be expanded or not; if
1611 * not, and the entity to expand would be external one, an exception
1612 * will be thrown
1613 */
1614 protected EntityDecl expandEntity(String id, boolean allowExt,
1615 Object extraArg)
1616 throws XMLStreamException
1617 {
1618 mCurrName = id;
1619
1620 EntityDecl ed = findEntity(id, extraArg);
1621
1622 if (ed == null) {
1623 /* 30-Sep-2005, TSa: As per [WSTX-5], let's only throw exception
1624 * if we have to resolve it (otherwise it's just best-effort,
1625 * and null is ok)
1626 */
1627 /* 02-Oct-2005, TSa: Plus, [WSTX-4] adds "undeclared entity
1628 * resolver"
1629 */
1630 if (mCfgReplaceEntities) {
1631 mCurrEntity = expandUnresolvedEntity(id);
1632 }
1633 return null;
1634 }
1635
1636 if (!mCfgTreatCharRefsAsEntities || this instanceof MinimalDTDReader) {
1637 expandEntity(ed, allowExt);
1638 }
1639
1640 return ed;
1641 }
1642
1643 /**
1644 *<p>
1645 * note: defined as private for documentation, ie. it's just called
1646 * from within this class (not sub-classes), from one specific method
1647 * (see above)
1648 *
1649 * @param ed Entity to be expanded
1650 * @param allowExt Whether external entities are allowed or not.
1651 */
1652 private void expandEntity(EntityDecl ed, boolean allowExt)
1653 throws XMLStreamException
1654 {
1655 String id = ed.getName();
1656
1657 /* Very first thing; we can immediately check if expanding
1658 * this entity would result in infinite recursion:
1659 */
1660 if (mInput.isOrIsExpandedFrom(id)) {
1661 throwRecursionError(id);
1662 }
1663
1664 /* Should not refer unparsed entities from attribute values
1665 * or text content (except via notation mechanism, but that's
1666 * not parsed here)
1667 */
1668 if (!ed.isParsed()) {
1669 throwParseError("Illegal reference to unparsed external entity \"{0}\"", id, null);
1670 }
1671
1672 // 28-Jun-2004, TSa: Do we support external entity expansion?
1673 boolean isExt = ed.isExternal();
1674 if (isExt) {
1675 if (!allowExt) { // never ok in attribute value...
1676 throwParseError("Encountered a reference to external parsed entity \"{0}\" when expanding attribute value: not legal as per XML 1.0/1.1 #3.1", id, null);
1677 }
1678 if (!mConfig.willSupportExternalEntities()) {
1679 throwParseError("Encountered a reference to external entity \"{0}\", but stream reader has feature \"{1}\" disabled",
1680 id, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES);
1681 }
1682 }
1683 verifyLimit("Maximum entity expansion count", mConfig.getMaxEntityCount(), ++mEntityExpansionCount);
1684 // First, let's give current context chance to save its stuff
1685 WstxInputSource oldInput = mInput;
1686 oldInput.saveContext(this);
1687 WstxInputSource newInput = null;
1688 try {
1689 newInput = ed.expand(oldInput, mEntityResolver, mConfig, mDocXmlVersion);
1690 } catch (FileNotFoundException fex) {
1691 /* Let's catch and rethrow this just so we get more meaningful
1692 * description (with input source position etc)
1693 */
1694 throwParseError("(was {0}) {1}", fex.getClass().getName(), fex.getMessage());
1695 } catch (IOException ioe) {
1696 throw constructFromIOE(ioe);
1697 }
1698 /* And then we'll need to make sure new input comes from the new
1699 * input source
1700 */
1701 initInputSource(newInput, isExt, id);
1702 }
1703
1704 /**
1705 *<p>
1706 * note: only called from the local expandEntity() method
1707 */
1708 private EntityDecl expandUnresolvedEntity(String id)
1709 throws XMLStreamException
1710 {
1711 XMLResolver resolver = mConfig.getUndeclaredEntityResolver();
1712 if (resolver != null) {
1713 /* Ok, we can check for recursion here; but let's only do that
1714 * if there is any chance that it might get resolved by
1715 * the special resolver (it must have been resolved this way
1716 * earlier, too...)
1717 */
1718 if (mInput.isOrIsExpandedFrom(id)) {
1719 throwRecursionError(id);
1720 }
1721
1722 WstxInputSource oldInput = mInput;
1723 oldInput.saveContext(this);
1724 // null, null -> no public or system ids
1725 int xmlVersion = mDocXmlVersion;
1726 // 05-Feb-2006, TSa: If xmlVersion not explicitly known, defaults to 1.0
1727 if (xmlVersion == XmlConsts.XML_V_UNKNOWN) {
1728 xmlVersion = XmlConsts.XML_V_10;
1729 }
1730 WstxInputSource newInput;
1731 try {
1732 newInput = DefaultInputResolver.resolveEntityUsing
1733 (oldInput, id, null, null, resolver, mConfig, xmlVersion);
1734 if (mCfgTreatCharRefsAsEntities) {
1735 return new IntEntity(WstxInputLocation.getEmptyLocation(), newInput.getEntityId(),
1736 newInput.getSource(), new char[]{}, WstxInputLocation.getEmptyLocation());
1737 }
1738 } catch (IOException ioe) {
1739 throw constructFromIOE(ioe);
1740 }
1741 if (newInput != null) {
1742 // true -> is external
1743 initInputSource(newInput, true, id);
1744 return null;
1745 }
1746 }
1747 handleUndeclaredEntity(id);
1748 return null;
1749 }
1750
1751 /*
1752 ///////////////////////////////////////////////////////////////////////
1753 // Abstract methods for sub-classes to implement
1754 ///////////////////////////////////////////////////////////////////////
1755 */
1756
/**
 * Abstract method for sub-classes to implement, for finding
 * the declaration of an entity by its name. Within this class it is
 * called when expanding an entity and when resolving a non-character
 * entity reference; a null return value is taken to mean that no
 * declaration was found.
 *
 * @param id Identifier of the entity to find
 * @param arg Optional argument passed from caller; needed by DTD
 *   reader.
 *
 * @return Declaration of the entity, or null if none was found
 */
protected abstract EntityDecl findEntity(String id, Object arg)
    throws XMLStreamException;
1767
/**
 * This method gets called if a declaration for an entity was not
 * found in entity expanding mode (enabled by default for xml reader,
 * always enabled for dtd reader), and the entity could not be resolved
 * using the "undeclared entity resolver" either.
 *
 * @param id Name of the entity for which no declaration was found
 */
protected abstract void handleUndeclaredEntity(String id)
    throws XMLStreamException;
1775
/**
 * Method called when a nested input source is being closed while the
 * current depth differs from the scope id of that source, that is,
 * when the end of the input block does not nest properly.
 *
 * @param closing Input source that is being closed
 */
protected abstract void handleIncompleteEntityProblem(WstxInputSource closing)
    throws XMLStreamException;
1778
1779 /*
1780 ///////////////////////////////////////////////////////////////////////
1781 // Basic tokenization
1782 ///////////////////////////////////////////////////////////////////////
1783 */
1784
1785 /**
1786 * Method that will parse name token (roughly equivalent to XML specs;
1787 * although bit lenier for more efficient handling); either uri prefix,
1788 * or local name.
1789 *<p>
1790 * Much of complexity in this method has to do with the intention to
1791 * try to avoid any character copies. In this optimal case algorithm
1792 * would be fairly simple. However, this only works if all data is
1793 * already in input buffer... if not, copy has to be made halfway
1794 * through parsing, and that complicates things.
1795 *<p>
1796 * One thing to note is that String returned has been canonicalized
1797 * and (if necessary) added to symbol table. It can thus be compared
1798 * against other such (usually id) Strings, with simple equality operator.
1799 *
1800 * @param c First character of the name; not yet checked for validity
1801 *
1802 * @return Canonicalized name String (which may have length 0, if
1803 * EOF or non-name-start char encountered)
1804 */
1805 protected String parseLocalName(char c)
1806 throws XMLStreamException
1807 {
1808 /* Has to start with letter, or '_' (etc); we won't allow ':' as that
1809 * is taken as namespace separator; no use trying to optimize
1810 * heavily as it's 98% likely it is a valid char...
1811 */
1812 if (!isNameStartChar(c)) {
1813 if (c == ':') {
1814 throwUnexpectedChar(c, " (missing namespace prefix?)");
1815 }
1816 throwUnexpectedChar(c, " (expected a name start character)");
1817 }
1818
1819 int ptr = mInputPtr;
1820 int hash = c;
1821 final int inputLen = mInputEnd;
1822 int startPtr = ptr-1; // already read previous char
1823 final char[] inputBuf = mInputBuffer;
1824
1825 /* After which there may be zero or more name chars
1826 * we have to consider
1827 */
1828 while (true) {
1829 if (ptr >= inputLen) {
1830 /* Ok, identifier may continue past buffer end, need
1831 * to continue with part 2 (separate method, as this is
1832 * not as common as having it all in buffer)
1833 */
1834 mInputPtr = ptr;
1835 return parseLocalName2(startPtr, hash);
1836 }
1837 // Ok, we have the char... is it a name char?
1838 c = inputBuf[ptr];
1839 if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
1840 break;
1841 }
1842 if (!isNameChar(c)) {
1843 break;
1844 }
1845 hash = (hash * 31) + c;
1846 ++ptr;
1847 }
1848 mInputPtr = ptr;
1849 return mSymbols.findSymbol(mInputBuffer, startPtr, ptr - startPtr, hash);
1850 }
1851
1852 /**
1853 * Second part of name token parsing; called when name can continue
1854 * past input buffer end (so only part was read before calling this
1855 * method to read the rest).
1856 *<p>
1857 * Note that this isn't heavily optimized, on assumption it's not
1858 * called very often.
1859 */
1860 protected String parseLocalName2(int start, int hash)
1861 throws XMLStreamException
1862 {
1863 int ptr = mInputEnd - start;
1864 // Let's assume fairly short names
1865 char[] outBuf = getNameBuffer(ptr+8);
1866
1867 if (ptr > 0) {
1868 System.arraycopy(mInputBuffer, start, outBuf, 0, ptr);
1869 }
1870
1871 int outLen = outBuf.length;
1872 while (true) {
1873 // note: names can not cross input block (entity) boundaries...
1874 if (mInputPtr >= mInputEnd) {
1875 if (!loadMoreFromCurrent()) {
1876 break;
1877 }
1878 }
1879 char c = mInputBuffer[mInputPtr];
1880 if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
1881 break;
1882 }
1883 if (!isNameChar(c)) {
1884 break;
1885 }
1886 ++mInputPtr;
1887 if (ptr >= outLen) {
1888 mNameBuffer = outBuf = expandBy50Pct(outBuf);
1889 outLen = outBuf.length;
1890 }
1891 outBuf[ptr++] = c;
1892 hash = (hash * 31) + c;
1893 }
1894 // Still need to canonicalize the name:
1895 return mSymbols.findSymbol(outBuf, 0, ptr, hash);
1896 }
1897
1898 /**
1899 * Method that will parse 'full' name token; what full means depends on
1900 * whether reader is namespace aware or not. If it is, full name means
1901 * local name with no namespace prefix (PI target, entity/notation name);
1902 * if not, name can contain arbitrary number of colons. Note that
1903 * element and attribute names are NOT parsed here, so actual namespace
1904 * prefix separation can be handled properly there.
1905 *<p>
1906 * Similar to {@link #parseLocalName}, much of complexity stems from
1907 * trying to avoid copying name characters from input buffer.
1908 *<p>
1909 * Note that returned String will be canonicalized, similar to
1910 * {@link #parseLocalName}, but without separating prefix/local name.
1911 *
1912 * @return Canonicalized name String (which may have length 0, if
1913 * EOF or non-name-start char encountered)
1914 */
1915 protected String parseFullName()
1916 throws XMLStreamException
1917 {
1918 if (mInputPtr >= mInputEnd) {
1919 loadMoreFromCurrent();
1920 }
1921 return parseFullName(mInputBuffer[mInputPtr++]);
1922 }
1923
1924 protected String parseFullName(char c)
1925 throws XMLStreamException
1926 {
1927 // First char has special handling:
1928 if (!isNameStartChar(c)) {
1929 if (c == ':') { // no name.... generally an error:
1930 if (mCfgNsEnabled) {
1931 throwNsColonException(parseFNameForError());
1932 }
1933 // Ok, that's fine actually
1934 } else {
1935 if (c <= CHAR_SPACE) {
1936 throwUnexpectedChar(c, " (missing name?)");
1937 }
1938 throwUnexpectedChar(c, " (expected a name start character)");
1939 }
1940 }
1941
1942 int ptr = mInputPtr;
1943 int hash = c;
1944 int inputLen = mInputEnd;
1945 int startPtr = ptr-1; // to account for the first char
1946
1947 /* After which there may be zero or more name chars
1948 * we have to consider
1949 */
1950 while (true) {
1951 if (ptr >= inputLen) {
1952 /* Ok, identifier may continue past buffer end, need
1953 * to continue with part 2 (separate method, as this is
1954 * not as common as having it all in buffer)
1955 */
1956 mInputPtr = ptr;
1957 return parseFullName2(startPtr, hash);
1958 }
1959 c = mInputBuffer[ptr];
1960 if (c == ':') { // colon only allowed in non-NS mode
1961 if (mCfgNsEnabled) {
1962 mInputPtr = ptr;
1963 throwNsColonException(new String(mInputBuffer, startPtr, ptr - startPtr) + parseFNameForError());
1964 }
1965 } else {
1966 if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
1967 break;
1968 }
1969 if (!isNameChar(c)) {
1970 break;
1971 }
1972 }
1973 hash = (hash * 31) + c;
1974 ++ptr;
1975 }
1976 mInputPtr = ptr;
1977 return mSymbols.findSymbol(mInputBuffer, startPtr, ptr - startPtr, hash);
1978 }
1979
1980 @SuppressWarnings("cast")
1981 protected String parseFullName2(int start, int hash)
1982 throws XMLStreamException
1983 {
1984 int ptr = mInputEnd - start;
1985 // Let's assume fairly short names
1986 char[] outBuf = getNameBuffer(ptr+8);
1987
1988 if (ptr > 0) {
1989 System.arraycopy(mInputBuffer, start, outBuf, 0, ptr);
1990 }
1991
1992 int outLen = outBuf.length;
1993 while (true) {
1994 /* 06-Sep-2004, TSa: Name tokens are not allowed to continue
1995 * past entity expansion ranges... that is, all characters
1996 * have to come from the same input source. Thus, let's only
1997 * load things from same input level
1998 */
1999 if (mInputPtr >= mInputEnd) {
2000 if (!loadMoreFromCurrent()) {
2001 break;
2002 }
2003 }
2004 char c = mInputBuffer[mInputPtr];
2005 if (c == ':') { // colon only allowed in non-NS mode
2006 if (mCfgNsEnabled) {
2007 throwNsColonException(new String(outBuf, 0, ptr) + c + parseFNameForError());
2008 }
2009 } else if (c < CHAR_LOWEST_LEGAL_LOCALNAME_CHAR) {
2010 break;
2011 } else if (!isNameChar(c)) {
2012 break;
2013 }
2014 ++mInputPtr;
2015
2016 if (ptr >= outLen) {
2017 mNameBuffer = outBuf = expandBy50Pct(outBuf);
2018 outLen = outBuf.length;
2019 }
2020 outBuf[ptr++] = c;
2021 hash = (hash * 31) + (int) c;
2022 }
2023
2024 // Still need to canonicalize the name:
2025 return mSymbols.findSymbol(outBuf, 0, ptr, hash);
2026 }
2027
2028 /**
2029 * Method called to read in full name, including unlimited number of
2030 * namespace separators (':'), for the purpose of displaying name in
2031 * an error message. Won't do any further validations, and parsing
2032 * is not optimized: main need is just to get more meaningful error
2033 * messages.
2034 */
2035 protected String parseFNameForError()
2036 throws XMLStreamException
2037 {
2038 StringBuilder sb = new StringBuilder(100);
2039 while (true) {
2040 char c;
2041
2042 if (mInputPtr < mInputEnd) {
2043 c = mInputBuffer[mInputPtr++];
2044 } else { // can't error here, so let's accept EOF for now:
2045 int i = getNext();
2046 if (i < 0) {
2047 break;
2048 }
2049 c = (char) i;
2050 }
2051 if (c != ':' && !isNameChar(c)) {
2052 --mInputPtr;
2053 break;
2054 }
2055 sb.append(c);
2056 }
2057 return sb.toString();
2058 }
2059
2060 protected final String parseEntityName(char c)
2061 throws XMLStreamException
2062 {
2063 String id = parseFullName(c);
2064 // Needs to be followed by a semi-colon, too.. from same input source:
2065 if (mInputPtr >= mInputEnd) {
2066 if (!loadMoreFromCurrent()) {
2067 throwParseError("Missing semicolon after reference for entity \"{0}\"", id, null);
2068 }
2069 }
2070 c = mInputBuffer[mInputPtr++];
2071 if (c != ';') {
2072 throwUnexpectedChar(c, "; expected a semi-colon after the reference for entity '"+id+"'");
2073 }
2074 return id;
2075 }
2076
2077 /**
2078 * Note: does not check for number of colons, amongst other things.
2079 * Main idea is to skip through what superficially seems like a valid
2080 * id, nothing more. This is only done when really skipping through
2081 * something we do not care about at all: not even whether names/ids
2082 * would be valid (for example, when ignoring internal DTD subset).
2083 *
2084 * @return Length of skipped name.
2085 */
2086 protected int skipFullName(char c)
2087 throws XMLStreamException
2088 {
2089 if (!isNameStartChar(c)) {
2090 --mInputPtr;
2091 return 0;
2092 }
2093
2094 /* After which there may be zero or more name chars
2095 * we have to consider
2096 */
2097 int count = 1;
2098 while (true) {
2099 c = (mInputPtr < mInputEnd) ?
2100 mInputBuffer[mInputPtr++] : getNextChar(SUFFIX_EOF_EXP_NAME);
2101 if (c != ':' && !isNameChar(c)) {
2102 break;
2103 }
2104 ++count;
2105 }
2106 return count;
2107 }
2108
2109 /**
2110 * Simple parsing method that parses system ids, which are generally
2111 * used in entities (from DOCTYPE declaration to internal/external
2112 * subsets).
2113 *<p>
2114 * NOTE: returned String is not canonicalized, on assumption that
2115 * external ids may be longish, and are not shared all that often, as
2116 * they are generally just used for resolving paths, if anything.
2117 *<br />
2118 * Also note that this method is not heavily optimized, as it's not
2119 * likely to be a bottleneck for parsing.
2120 */
2121 protected final String parseSystemId(char quoteChar, boolean convertLFs,
2122 String errorMsg)
2123 throws XMLStreamException
2124 {
2125 char[] buf = getNameBuffer(-1);
2126 int ptr = 0;
2127
2128 while (true) {
2129 char c = (mInputPtr < mInputEnd) ?
2130 mInputBuffer[mInputPtr++] : getNextChar(errorMsg);
2131 if (c == quoteChar) {
2132 break;
2133 }
2134 /* ??? 14-Jun-2004, TSa: Should we normalize linefeeds or not?
2135 * It seems like we should, for all input... so that's the way it
2136 * works.
2137 */
2138 if (c == '\n') {
2139 markLF();
2140 } else if (c == '\r') {
2141 if (peekNext() == '\n') {
2142 ++mInputPtr;
2143 if (!convertLFs) {
2144 /* The only tricky thing; need to preserve 2-char LF; need to
2145 * output one char from here, then can fall back to default:
2146 */
2147 if (ptr >= buf.length) {
2148 buf = expandBy50Pct(buf);
2149 }
2150 buf[ptr++] = '\r';
2151 }
2152 c = '\n';
2153 } else if (convertLFs) {
2154 c = '\n';
2155 }
2156 }
2157
2158 // Other than that, let's just append it:
2159 if (ptr >= buf.length) {
2160 buf = expandBy50Pct(buf);
2161 }
2162 buf[ptr++] = c;
2163 }
2164
2165 return (ptr == 0) ? "" : new String(buf, 0, ptr);
2166 }
2167
2168 /**
2169 * Simple parsing method that parses system ids, which are generally
2170 * used in entities (from DOCTYPE declaration to internal/external
2171 * subsets).
2172 *<p>
2173 * As per xml specs, the contents are actually normalized.
2174 *<p>
2175 * NOTE: returned String is not canonicalized, on assumption that
2176 * external ids may be longish, and are not shared all that often, as
2177 * they are generally just used for resolving paths, if anything.
2178 *<br />
2179 * Also note that this method is not heavily optimized, as it's not
2180 * likely to be a bottleneck for parsing.
2181 */
2182 protected final String parsePublicId(char quoteChar, String errorMsg)
2183 throws XMLStreamException
2184 {
2185 char[] buf = getNameBuffer(-1);
2186 int ptr = 0;
2187 boolean spaceToAdd = false;
2188
2189 while (true) {
2190 char c = (mInputPtr < mInputEnd) ?
2191 mInputBuffer[mInputPtr++] : getNextChar(errorMsg);
2192 if (c == quoteChar) {
2193 break;
2194 }
2195 if (c == '\n') {
2196 markLF();
2197 spaceToAdd = true;
2198 continue;
2199 } else if (c == '\r') {
2200 if (peekNext() == '\n') {
2201 ++mInputPtr;
2202 }
2203 spaceToAdd = true;
2204 continue;
2205 } else if (c == CHAR_SPACE) {
2206 spaceToAdd = true;
2207 continue;
2208 } else {
2209 // Verify it's a legal pubid char (see XML spec, #13, from 2.3)
2210 if ((c >= VALID_PUBID_CHAR_COUNT)
2211 || sPubidValidity[c] != PUBID_CHAR_VALID_B) {
2212 throwUnexpectedChar(c, " in public identifier");
2213 }
2214 }
2215
2216 // Other than that, let's just append it:
2217 if (ptr >= buf.length) {
2218 buf = expandBy50Pct(buf);
2219 }
2220 /* Space-normalization means scrapping leading and trailing
2221 * white space, and coalescing remaining ws into single spaces.
2222 */
2223 if (spaceToAdd) { // pending white space to add?
2224 if (c == CHAR_SPACE) { // still a space; let's skip
2225 continue;
2226 }
2227 /* ok: if we have non-space, we'll either forget about
2228 * space(s) (if nothing has been output, ie. leading space),
2229 * or output a single space (in-between non-white space)
2230 */
2231 spaceToAdd = false;
2232 if (ptr > 0) {
2233 buf[ptr++] = CHAR_SPACE;
2234 if (ptr >= buf.length) {
2235 buf = expandBy50Pct(buf);
2236 }
2237 }
2238 }
2239 buf[ptr++] = c;
2240 }
2241
2242 return (ptr == 0) ? "" : new String(buf, 0, ptr);
2243 }
2244
2245 protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs,
2246 String errorMsg)
2247 throws XMLStreamException
2248 {
2249 // Let's first ensure we have some data in there...
2250 if (mInputPtr >= mInputEnd) {
2251 loadMore(errorMsg);
2252 }
2253 while (true) {
2254 // Let's loop consequtive 'easy' spans:
2255 char[] inputBuf = mInputBuffer;
2256 int inputLen = mInputEnd;
2257 int ptr = mInputPtr;
2258 int startPtr = ptr;
2259 while (ptr < inputLen) {
2260 char c = inputBuf[ptr++];
2261 if (c == endChar) {
2262 int thisLen = ptr - startPtr - 1;
2263 if (thisLen > 0) {
2264 tb.append(inputBuf, startPtr, thisLen);
2265 }
2266 mInputPtr = ptr;
2267 return;
2268 }
2269 if (c == '\n') {
2270 mInputPtr = ptr; // markLF() requires this
2271 markLF();
2272 } else if (c == '\r') {
2273 if (!convertLFs && ptr < inputLen) {
2274 if (inputBuf[ptr] == '\n') {
2275 ++ptr;
2276 }
2277 mInputPtr = ptr;
2278 markLF();
2279 } else {
2280 int thisLen = ptr - startPtr - 1;
2281 if (thisLen > 0) {
2282 tb.append(inputBuf, startPtr, thisLen);
2283 }
2284 mInputPtr = ptr;
2285 c = getNextChar(errorMsg);
2286 if (c != '\n') {
2287 --mInputPtr; // pusback
2288 tb.append(convertLFs ? '\n' : '\r');
2289 } else {
2290 if (convertLFs) {
2291 tb.append('\n');
2292 } else {
2293 tb.append('\r');
2294 tb.append('\n');
2295 }
2296 }
2297 startPtr = ptr = mInputPtr;
2298 markLF();
2299 }
2300 }
2301 }
2302 int thisLen = ptr - startPtr;
2303 if (thisLen > 0) {
2304 tb.append(inputBuf, startPtr, thisLen);
2305 }
2306 loadMore(errorMsg);
2307 startPtr = ptr = mInputPtr;
2308 inputBuf = mInputBuffer;
2309 inputLen = mInputEnd;
2310 }
2311 }
2312
2313 /*
2314 ///////////////////////////////////////////////////////////////////////
2315 // Internal methods
2316 ///////////////////////////////////////////////////////////////////////
2317 */
2318
2319 private int resolveCharEnt(StringBuffer originalCharacters)
2320 throws XMLStreamException
2321 {
2322 int value = 0;
2323 char c = getNextChar(SUFFIX_IN_ENTITY_REF);
2324
2325 if (originalCharacters != null) {
2326 originalCharacters.append(c);
2327 }
2328
2329 if (c == 'x') { // hex
2330 while (true) {
2331 c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
2332 : getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF);
2333 if (c == ';') {
2334 break;
2335 }
2336
2337 if (originalCharacters != null) {
2338 originalCharacters.append(c);
2339 }
2340 value = value << 4;
2341 if (c <= '9' && c >= '0') {
2342 value += (c - '0');
2343 } else if (c >= 'a' && c <= 'f') {
2344 value += 10 + (c - 'a');
2345 } else if (c >= 'A' && c <= 'F') {
2346 value += 10 + (c - 'A');
2347 } else {
2348 throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F).");
2349 }
2350 // Overflow?
2351 if (value > MAX_UNICODE_CHAR) {
2352 reportUnicodeOverflow();
2353 }
2354 }
2355 } else { // numeric (decimal)
2356 while (c != ';') {
2357 if (c <= '9' && c >= '0') {
2358 value = (value * 10) + (c - '0');
2359 // Overflow?
2360 if (value > MAX_UNICODE_CHAR) {
2361 reportUnicodeOverflow();
2362 }
2363 } else {
2364 throwUnexpectedChar(c, "; expected a decimal number.");
2365 }
2366 c = (mInputPtr < mInputEnd) ? mInputBuffer[mInputPtr++]
2367 : getNextCharFromCurrent(SUFFIX_IN_ENTITY_REF);
2368
2369 if (originalCharacters != null && c != ';') {
2370 originalCharacters.append(c);
2371 }
2372 }
2373 }
2374 validateChar(value);
2375 return value;
2376 }
2377
2378 /**
2379 * Method that will verify that expanded Unicode codepoint is a valid
2380 * XML content character.
2381 */
2382 private final void validateChar(int value)
2383 throws XMLStreamException
2384 {
2385 /* 24-Jan-2006, TSa: Ok, "high" Unicode chars are problematic,
2386 * need to be reported by a surrogate pair..
2387 */
2388 if (value >= 0xD800) {
2389 if (value < 0xE000) { // no surrogates via entity expansion
2390 reportIllegalChar(value);
2391 }
2392 if (value > 0xFFFF) {
2393 // Within valid range at all?
2394 if (value > MAX_UNICODE_CHAR) {
2395 reportUnicodeOverflow();
2396 }
2397 } else if (value >= 0xFFFE) { // 0xFFFE and 0xFFFF are illegal too
2398 reportIllegalChar(value);
2399 }
2400 // Ok, fine as is
2401 } else if (value < 32) {
2402 if (value == 0) {
2403 throwParseError("Invalid character reference: null character not allowed in XML content.");
2404 }
2405 // XML 1.1 allows most other chars; 1.0 does not:
2406 if (!mXml10AllowAllEscapedChars) {
2407 if (!mXml11 &&
2408 (value != 0x9 && value != 0xA && value != 0xD)) {
2409 reportIllegalChar(value);
2410 }
2411 }
2412 }
2413 }
2414
2415 protected final char[] getNameBuffer(int minSize)
2416 {
2417 char[] buf = mNameBuffer;
2418
2419 if (buf == null) {
2420 mNameBuffer = buf = new char[(minSize > 48) ? (minSize+16) : 64];
2421 } else if (minSize >= buf.length) { // let's allow one char extra...
2422 int len = buf.length;
2423 len += (len >> 1); // grow by 50%
2424 mNameBuffer = buf = new char[(minSize >= len) ? (minSize+16) : len];
2425 }
2426 return buf;
2427 }
2428
2429 protected final char[] expandBy50Pct(char[] buf)
2430 {
2431 int len = buf.length;
2432 char[] newBuf = new char[len + (len >> 1)];
2433 System.arraycopy(buf, 0, newBuf, 0, len);
2434 return newBuf;
2435 }
2436
2437 /**
2438 * Method called to throw an exception indicating that a name that
2439 * should not be namespace-qualified (PI target, entity/notation name)
2440 * is one, and reader is namespace aware.
2441 */
2442 private void throwNsColonException(String name)
2443 throws XMLStreamException
2444 {
2445 throwParseError("Illegal name \"{0}\" (PI target, entity/notation name): can not contain a colon (XML Namespaces 1.0#6)", name, null);
2446 }
2447
2448 private void throwRecursionError(String entityName)
2449 throws XMLStreamException
2450 {
2451 throwParseError("Illegal entity expansion: entity \"{0}\" expands itself recursively.", entityName, null);
2452 }
2453
2454 private void reportUnicodeOverflow()
2455 throws XMLStreamException
2456 {
2457 throwParseError("Illegal character entity: value higher than max allowed (0x{0})", Integer.toHexString(MAX_UNICODE_CHAR), null);
2458 }
2459
2460 private void reportIllegalChar(int value)
2461 throws XMLStreamException
2462 {
2463 throwParseError("Illegal character entity: expansion character (code 0x{0}", Integer.toHexString(value), null);
2464 }
2465
2466 protected void verifyLimit(String type, long maxValue, long currentValue)
2467 throws XMLStreamException
2468 {
2469 if (currentValue > maxValue) {
2470 throw constructLimitViolation(type, maxValue);
2471 }
2472 }
2473
2474 protected XMLStreamException constructLimitViolation(String type, long limit)
2475 throws XMLStreamException
2476 {
2477 return new XMLStreamException(type+" limit ("+limit+") exceeded");
2478 }
2479 }