gitweb.ps.run Git - toc/blob - antlr4-cpp-runtime-4.9.2-source/runtime/src/DefaultErrorStrategy.h

   1 /* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
   2  * Use of this file is governed by the BSD 3-clause license that
   3  * can be found in the LICENSE.txt file in the project root.
   4  */
   5
   6 #pragma once
   7
   8 #include "ANTLRErrorStrategy.h"
   9 #include "misc/IntervalSet.h"
  10
  11 namespace antlr4 {
  12
  13   /**
  14    * This is the default implementation of {@link ANTLRErrorStrategy} used for
  15    * error reporting and recovery in ANTLR parsers.
  16    */
  17   class ANTLR4CPP_PUBLIC DefaultErrorStrategy : public ANTLRErrorStrategy {
  18   public:
  19     DefaultErrorStrategy();
  20     DefaultErrorStrategy(DefaultErrorStrategy const& other) = delete;
  21     virtual ~DefaultErrorStrategy();
  22
  23     DefaultErrorStrategy& operator = (DefaultErrorStrategy const& other) = delete;
  24
  25   protected:
  26     /**
  27      * Indicates whether the error strategy is currently "recovering from an
  28      * error". This is used to suppress reporting multiple error messages while
  29      * attempting to recover from a detected syntax error.
  30      *
  31      * @see #inErrorRecoveryMode
  32      */
  33     bool errorRecoveryMode;
  34
  35     /** The index into the input stream where the last error occurred.
  36      *  This is used to prevent infinite loops where an error is found
  37      *  but no token is consumed during recovery...another error is found,
  38      *  ad nauseum.  This is a failsafe mechanism to guarantee that at least
  39      *  one token/tree node is consumed for two errors.
  40      */
  41     int lastErrorIndex;
  42
  43     misc::IntervalSet lastErrorStates;
  44
  45     /// <summary>
  46     /// {@inheritDoc}
  47     /// <p/>
  48     /// The default implementation simply calls <seealso cref="#endErrorCondition"/> to
  49     /// ensure that the handler is not in error recovery mode.
  50     /// </summary>
  51   public:
  52     virtual void reset(Parser *recognizer) override;
  53
  54     /// <summary>
  55     /// This method is called to enter error recovery mode when a recognition
  56     /// exception is reported.
  57     /// </summary>
  58     /// <param name="recognizer"> the parser instance </param>
  59   protected:
  60     virtual void beginErrorCondition(Parser *recognizer);
  61
  62     /// <summary>
  63     /// {@inheritDoc}
  64     /// </summary>
  65   public:
  66     virtual bool inErrorRecoveryMode(Parser *recognizer) override;
  67
  68     /// <summary>
  69     /// This method is called to leave error recovery mode after recovering from
  70     /// a recognition exception.
  71     /// </summary>
  72     /// <param name="recognizer"> </param>
  73   protected:
  74     virtual void endErrorCondition(Parser *recognizer);
  75
  76     /// <summary>
  77     /// {@inheritDoc}
  78     /// <p/>
  79     /// The default implementation simply calls <seealso cref="#endErrorCondition"/>.
  80     /// </summary>
  81   public:
  82     virtual void reportMatch(Parser *recognizer) override;
  83
  84     /// {@inheritDoc}
  85     /// <p/>
  86     /// The default implementation returns immediately if the handler is already
  87     /// in error recovery mode. Otherwise, it calls <seealso cref="#beginErrorCondition"/>
  88     /// and dispatches the reporting task based on the runtime type of {@code e}
  89     /// according to the following table.
  90     ///
  91     /// <ul>
  92     /// <li><seealso cref="NoViableAltException"/>: Dispatches the call to
  93     /// <seealso cref="#reportNoViableAlternative"/></li>
  94     /// <li><seealso cref="InputMismatchException"/>: Dispatches the call to
  95     /// <seealso cref="#reportInputMismatch"/></li>
  96     /// <li><seealso cref="FailedPredicateException"/>: Dispatches the call to
  97     /// <seealso cref="#reportFailedPredicate"/></li>
  98     /// <li>All other types: calls <seealso cref="Parser#notifyErrorListeners"/> to report
  99     /// the exception</li>
 100     /// </ul>
 101     virtual void reportError(Parser *recognizer, const RecognitionException &e) override;
 102
 103     /// <summary>
 104     /// {@inheritDoc}
 105     /// <p/>
 106     /// The default implementation resynchronizes the parser by consuming tokens
 107     /// until we find one in the resynchronization set--loosely the set of tokens
 108     /// that can follow the current rule.
 109     /// </summary>
 110     virtual void recover(Parser *recognizer, std::exception_ptr e) override;
 111
 112     /**
 113      * The default implementation of {@link ANTLRErrorStrategy#sync} makes sure
 114      * that the current lookahead symbol is consistent with what were expecting
 115      * at this point in the ATN. You can call this anytime but ANTLR only
 116      * generates code to check before subrules/loops and each iteration.
 117      *
 118      * <p>Implements Jim Idle's magic sync mechanism in closures and optional
 119      * subrules. E.g.,</p>
 120      *
 121      * <pre>
 122      * a : sync ( stuff sync )* ;
 123      * sync : {consume to what can follow sync} ;
 124      * </pre>
 125      *
 126      * At the start of a sub rule upon error, {@link #sync} performs single
 127      * token deletion, if possible. If it can't do that, it bails on the current
 128      * rule and uses the default error recovery, which consumes until the
 129      * resynchronization set of the current rule.
 130      *
 131      * <p>If the sub rule is optional ({@code (...)?}, {@code (...)*}, or block
 132      * with an empty alternative), then the expected set includes what follows
 133      * the subrule.</p>
 134      *
 135      * <p>During loop iteration, it consumes until it sees a token that can start a
 136      * sub rule or what follows loop. Yes, that is pretty aggressive. We opt to
 137      * stay in the loop as long as possible.</p>
 138      *
 139      * <p><strong>ORIGINS</strong></p>
 140      *
 141      * <p>Previous versions of ANTLR did a poor job of their recovery within loops.
 142      * A single mismatch token or missing token would force the parser to bail
 143      * out of the entire rules surrounding the loop. So, for rule</p>
 144      *
 145      * <pre>
 146      * classDef : 'class' ID '{' member* '}'
 147      * </pre>
 148      *
 149      * input with an extra token between members would force the parser to
 150      * consume until it found the next class definition rather than the next
 151      * member definition of the current class.
 152      *
 153      * <p>This functionality cost a little bit of effort because the parser has to
 154      * compare token set at the start of the loop and at each iteration. If for
 155      * some reason speed is suffering for you, you can turn off this
 156      * functionality by simply overriding this method as a blank { }.</p>
 157      */
 158     virtual void sync(Parser *recognizer) override;
 159
 160     /// <summary>
 161     /// This is called by <seealso cref="#reportError"/> when the exception is a
 162     /// <seealso cref="NoViableAltException"/>.
 163     /// </summary>
 164     /// <seealso cref= #reportError
 165     /// </seealso>
 166     /// <param name="recognizer"> the parser instance </param>
 167     /// <param name="e"> the recognition exception </param>
 168   protected:
 169     virtual void reportNoViableAlternative(Parser *recognizer, const NoViableAltException &e);
 170
 171     /// <summary>
 172     /// This is called by <seealso cref="#reportError"/> when the exception is an
 173     /// <seealso cref="InputMismatchException"/>.
 174     /// </summary>
 175     /// <seealso cref= #reportError
 176     /// </seealso>
 177     /// <param name="recognizer"> the parser instance </param>
 178     /// <param name="e"> the recognition exception </param>
 179     virtual void reportInputMismatch(Parser *recognizer, const InputMismatchException &e);
 180
 181     /// <summary>
 182     /// This is called by <seealso cref="#reportError"/> when the exception is a
 183     /// <seealso cref="FailedPredicateException"/>.
 184     /// </summary>
 185     /// <seealso cref= #reportError
 186     /// </seealso>
 187     /// <param name="recognizer"> the parser instance </param>
 188     /// <param name="e"> the recognition exception </param>
 189     virtual void reportFailedPredicate(Parser *recognizer, const FailedPredicateException &e);
 190
 191     /**
 192      * This method is called to report a syntax error which requires the removal
 193      * of a token from the input stream. At the time this method is called, the
 194      * erroneous symbol is current {@code LT(1)} symbol and has not yet been
 195      * removed from the input stream. When this method returns,
 196      * {@code recognizer} is in error recovery mode.
 197      *
 198      * <p>This method is called when {@link #singleTokenDeletion} identifies
 199      * single-token deletion as a viable recovery strategy for a mismatched
 200      * input error.</p>
 201      *
 202      * <p>The default implementation simply returns if the handler is already in
 203      * error recovery mode. Otherwise, it calls {@link #beginErrorCondition} to
 204      * enter error recovery mode, followed by calling
 205      * {@link Parser#notifyErrorListeners}.</p>
 206      *
 207      * @param recognizer the parser instance
 208      */
 209     virtual void reportUnwantedToken(Parser *recognizer);
 210
 211     /**
 212      * This method is called to report a syntax error which requires the
 213      * insertion of a missing token into the input stream. At the time this
 214      * method is called, the missing token has not yet been inserted. When this
 215      * method returns, {@code recognizer} is in error recovery mode.
 216      *
 217      * <p>This method is called when {@link #singleTokenInsertion} identifies
 218      * single-token insertion as a viable recovery strategy for a mismatched
 219      * input error.</p>
 220      *
 221      * <p>The default implementation simply returns if the handler is already in
 222      * error recovery mode. Otherwise, it calls {@link #beginErrorCondition} to
 223      * enter error recovery mode, followed by calling
 224      * {@link Parser#notifyErrorListeners}.</p>
 225      *
 226      * @param recognizer the parser instance
 227      */
 228     virtual void reportMissingToken(Parser *recognizer);
 229
 230   public:
 231     /**
 232      * {@inheritDoc}
 233      *
 234      * <p>The default implementation attempts to recover from the mismatched input
 235      * by using single token insertion and deletion as described below. If the
 236      * recovery attempt fails, this method throws an
 237      * {@link InputMismatchException}.</p>
 238      *
 239      * <p><strong>EXTRA TOKEN</strong> (single token deletion)</p>
 240      *
 241      * <p>{@code LA(1)} is not what we are looking for. If {@code LA(2)} has the
 242      * right token, however, then assume {@code LA(1)} is some extra spurious
 243      * token and delete it. Then consume and return the next token (which was
 244      * the {@code LA(2)} token) as the successful result of the match operation.</p>
 245      *
 246      * <p>This recovery strategy is implemented by {@link #singleTokenDeletion}.</p>
 247      *
 248      * <p><strong>MISSING TOKEN</strong> (single token insertion)</p>
 249      *
 250      * <p>If current token (at {@code LA(1)}) is consistent with what could come
 251      * after the expected {@code LA(1)} token, then assume the token is missing
 252      * and use the parser's {@link TokenFactory} to create it on the fly. The
 253      * "insertion" is performed by returning the created token as the successful
 254      * result of the match operation.</p>
 255      *
 256      * <p>This recovery strategy is implemented by {@link #singleTokenInsertion}.</p>
 257      *
 258      * <p><strong>EXAMPLE</strong></p>
 259      *
 260      * <p>For example, Input {@code i=(3;} is clearly missing the {@code ')'}. When
 261      * the parser returns from the nested call to {@code expr}, it will have
 262      * call chain:</p>
 263      *
 264      * <pre>
 265      * stat &rarr; expr &rarr; atom
 266      * </pre>
 267      *
 268      * and it will be trying to match the {@code ')'} at this point in the
 269      * derivation:
 270      *
 271      * <pre>
 272      * =&gt; ID '=' '(' INT ')' ('+' atom)* ';'
 273      *                    ^
 274      * </pre>
 275      *
 276      * The attempt to match {@code ')'} will fail when it sees {@code ';'} and
 277      * call {@link #recoverInline}. To recover, it sees that {@code LA(1)==';'}
 278      * is in the set of tokens that can follow the {@code ')'} token reference
 279      * in rule {@code atom}. It can assume that you forgot the {@code ')'}.
 280      */
 281     virtual Token* recoverInline(Parser *recognizer) override;
 282
 283     /// <summary>
 284     /// This method implements the single-token insertion inline error recovery
 285     /// strategy. It is called by <seealso cref="#recoverInline"/> if the single-token
 286     /// deletion strategy fails to recover from the mismatched input. If this
 287     /// method returns {@code true}, {@code recognizer} will be in error recovery
 288     /// mode.
 289     /// <p/>
 290     /// This method determines whether or not single-token insertion is viable by
 291     /// checking if the {@code LA(1)} input symbol could be successfully matched
 292     /// if it were instead the {@code LA(2)} symbol. If this method returns
 293     /// {@code true}, the caller is responsible for creating and inserting a
 294     /// token with the correct type to produce this behavior.
 295     /// </summary>
 296     /// <param name="recognizer"> the parser instance </param>
 297     /// <returns> {@code true} if single-token insertion is a viable recovery
 298     /// strategy for the current mismatched input, otherwise {@code false} </returns>
 299   protected:
 300     virtual bool singleTokenInsertion(Parser *recognizer);
 301
 302     /// <summary>
 303     /// This method implements the single-token deletion inline error recovery
 304     /// strategy. It is called by <seealso cref="#recoverInline"/> to attempt to recover
 305     /// from mismatched input. If this method returns null, the parser and error
 306     /// handler state will not have changed. If this method returns non-null,
 307     /// {@code recognizer} will <em>not</em> be in error recovery mode since the
 308     /// returned token was a successful match.
 309     /// <p/>
 310     /// If the single-token deletion is successful, this method calls
 311     /// <seealso cref="#reportUnwantedToken"/> to report the error, followed by
 312     /// <seealso cref="Parser#consume"/> to actually "delete" the extraneous token. Then,
 313     /// before returning <seealso cref="#reportMatch"/> is called to signal a successful
 314     /// match.
 315     /// </summary>
 316     /// <param name="recognizer"> the parser instance </param>
 317     /// <returns> the successfully matched <seealso cref="Token"/> instance if single-token
 318     /// deletion successfully recovers from the mismatched input, otherwise
 319     /// {@code null} </returns>
 320     virtual Token* singleTokenDeletion(Parser *recognizer);
 321
 322     /// <summary>
 323     /// Conjure up a missing token during error recovery.
 324     ///
 325     ///  The recognizer attempts to recover from single missing
 326     ///  symbols. But, actions might refer to that missing symbol.
 327     ///  For example, x=ID {f($x);}. The action clearly assumes
 328     ///  that there has been an identifier matched previously and that
 329     ///  $x points at that token. If that token is missing, but
 330     ///  the next token in the stream is what we want we assume that
 331     ///  this token is missing and we keep going. Because we
 332     ///  have to return some token to replace the missing token,
 333     ///  we have to conjure one up. This method gives the user control
 334     ///  over the tokens returned for missing tokens. Mostly,
 335     ///  you will want to create something special for identifier
 336     ///  tokens. For literals such as '{' and ',', the default
 337     ///  action in the parser or tree parser works. It simply creates
 338     ///  a CommonToken of the appropriate type. The text will be the token.
 339     ///  If you change what tokens must be created by the lexer,
 340     ///  override this method to create the appropriate tokens.
 341     /// </summary>
 342     virtual Token* getMissingSymbol(Parser *recognizer);
 343
 344     virtual misc::IntervalSet getExpectedTokens(Parser *recognizer);
 345
 346     /// <summary>
 347     /// How should a token be displayed in an error message? The default
 348     ///  is to display just the text, but during development you might
 349     ///  want to have a lot of information spit out.  Override in that case
 350     ///  to use t.toString() (which, for CommonToken, dumps everything about
 351     ///  the token). This is better than forcing you to override a method in
 352     ///  your token objects because you don't have to go modify your lexer
 353     ///  so that it creates a new class.
 354     /// </summary>
 355     virtual std::string getTokenErrorDisplay(Token *t);
 356
 357     virtual std::string getSymbolText(Token *symbol);
 358
 359     virtual size_t getSymbolType(Token *symbol);
 360
 361     virtual std::string escapeWSAndQuote(const std::string &s) const;
 362
 363     /*  Compute the error recovery set for the current rule.  During
 364      *  rule invocation, the parser pushes the set of tokens that can
 365      *  follow that rule reference on the stack; this amounts to
 366      *  computing FIRST of what follows the rule reference in the
 367      *  enclosing rule. See LinearApproximator.FIRST().
 368      *  This local follow set only includes tokens
 369      *  from within the rule; i.e., the FIRST computation done by
 370      *  ANTLR stops at the end of a rule.
 371      *
 372      *  EXAMPLE
 373      *
 374      *  When you find a "no viable alt exception", the input is not
 375      *  consistent with any of the alternatives for rule r.  The best
 376      *  thing to do is to consume tokens until you see something that
 377      *  can legally follow a call to r *or* any rule that called r.
 378      *  You don't want the exact set of viable next tokens because the
 379      *  input might just be missing a token--you might consume the
 380      *  rest of the input looking for one of the missing tokens.
 381      *
 382      *  Consider grammar:
 383      *
 384      *  a : '[' b ']'
 385      *    | '(' b ')'
 386      *    ;
 387      *  b : c '^' INT ;
 388      *  c : ID
 389      *    | INT
 390      *    ;
 391      *
 392      *  At each rule invocation, the set of tokens that could follow
 393      *  that rule is pushed on a stack.  Here are the various
 394      *  context-sensitive follow sets:
 395      *
 396      *  FOLLOW(b1_in_a) = FIRST(']') = ']'
 397      *  FOLLOW(b2_in_a) = FIRST(')') = ')'
 398      *  FOLLOW(c_in_b) = FIRST('^') = '^'
 399      *
 400      *  Upon erroneous input "[]", the call chain is
 401      *
 402      *  a -> b -> c
 403      *
 404      *  and, hence, the follow context stack is:
 405      *
 406      *  depth     follow set       start of rule execution
 407      *    0         <EOF>                    a (from main())
 408      *    1          ']'                     b
 409      *    2          '^'                     c
 410      *
 411      *  Notice that ')' is not included, because b would have to have
 412      *  been called from a different context in rule a for ')' to be
 413      *  included.
 414      *
 415      *  For error recovery, we cannot consider FOLLOW(c)
 416      *  (context-sensitive or otherwise).  We need the combined set of
 417      *  all context-sensitive FOLLOW sets--the set of all tokens that
 418      *  could follow any reference in the call chain.  We need to
 419      *  resync to one of those tokens.  Note that FOLLOW(c)='^' and if
 420      *  we resync'd to that token, we'd consume until EOF.  We need to
 421      *  sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
 422      *  In this case, for input "[]", LA(1) is ']' and in the set, so we would
 423      *  not consume anything. After printing an error, rule c would
 424      *  return normally.  Rule b would not find the required '^' though.
 425      *  At this point, it gets a mismatched token error and throws an
 426      *  exception (since LA(1) is not in the viable following token
 427      *  set).  The rule exception handler tries to recover, but finds
 428      *  the same recovery set and doesn't consume anything.  Rule b
 429      *  exits normally returning to rule a.  Now it finds the ']' (and
 430      *  with the successful match exits errorRecovery mode).
 431      *
 432      *  So, you can see that the parser walks up the call chain looking
 433      *  for the token that was a member of the recovery set.
 434      *
 435      *  Errors are not generated in errorRecovery mode.
 436      *
 437      *  ANTLR's error recovery mechanism is based upon original ideas:
 438      *
 439      *  "Algorithms + Data Structures = Programs" by Niklaus Wirth
 440      *
 441      *  and
 442      *
 443      *  "A note on error recovery in recursive descent parsers":
 444      *  http://portal.acm.org/citation.cfm?id=947902.947905
 445      *
 446      *  Later, Josef Grosch had some good ideas:
 447      *
 448      *  "Efficient and Comfortable Error Recovery in Recursive Descent
 449      *  Parsers":
 450      *  ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
 451      *
 452      *  Like Grosch I implement context-sensitive FOLLOW sets that are combined
 453      *  at run-time upon error to avoid overhead during parsing.
 454      */
 455     virtual misc::IntervalSet getErrorRecoverySet(Parser *recognizer);
 456
 457     /// <summary>
 458     /// Consume tokens until one matches the given token set. </summary>
 459     virtual void consumeUntil(Parser *recognizer, const misc::IntervalSet &set);
 460
 461   private:
 462     std::vector<std::unique_ptr<Token>> _errorSymbols; // Temporarily created token.
 463     void InitializeInstanceFields();
 464   };
 465
 466 } // namespace antlr4