1 /* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
2 * Use of this file is governed by the BSD 3-clause license that
3 * can be found in the LICENSE.txt file in the project root.
7 #include "atn/OrderedATNConfigSet.h"
9 #include "LexerNoViableAltException.h"
10 #include "atn/RuleStopState.h"
11 #include "atn/RuleTransition.h"
12 #include "atn/SingletonPredictionContext.h"
13 #include "atn/PredicateTransition.h"
14 #include "atn/ActionTransition.h"
15 #include "atn/TokensStartState.h"
16 #include "misc/Interval.h"
20 #include "dfa/DFAState.h"
21 #include "atn/LexerATNConfig.h"
22 #include "atn/LexerActionExecutor.h"
23 #include "atn/EmptyPredictionContext.h"
25 #include "atn/LexerATNSimulator.h"
30 using namespace antlr4;
31 using namespace antlr4::atn;
32 using namespace antlrcpp;
34 LexerATNSimulator::SimState::~SimState() {
37 void LexerATNSimulator::SimState::reset() {
38 index = INVALID_INDEX;
40 charPos = INVALID_INDEX;
41 dfaState = nullptr; // Don't delete. It's just a reference.
44 void LexerATNSimulator::SimState::InitializeInstanceFields() {
45 index = INVALID_INDEX;
47 charPos = INVALID_INDEX;
50 int LexerATNSimulator::match_calls = 0;
53 LexerATNSimulator::LexerATNSimulator(const ATN &atn, std::vector<dfa::DFA> &decisionToDFA,
54 PredictionContextCache &sharedContextCache)
55 : LexerATNSimulator(nullptr, atn, decisionToDFA, sharedContextCache) {
58 LexerATNSimulator::LexerATNSimulator(Lexer *recog, const ATN &atn, std::vector<dfa::DFA> &decisionToDFA,
59 PredictionContextCache &sharedContextCache)
60 : ATNSimulator(atn, sharedContextCache), _recog(recog), _decisionToDFA(decisionToDFA) {
61 InitializeInstanceFields();
64 void LexerATNSimulator::copyState(LexerATNSimulator *simulator) {
65 _charPositionInLine = simulator->_charPositionInLine;
66 _line = simulator->_line;
67 _mode = simulator->_mode;
68 _startIndex = simulator->_startIndex;
71 size_t LexerATNSimulator::match(CharStream *input, size_t mode) {
74 ssize_t mark = input->mark();
76 auto onExit = finally([input, mark] {
80 _startIndex = input->index();
82 const dfa::DFA &dfa = _decisionToDFA[mode];
83 if (dfa.s0 == nullptr) {
84 return matchATN(input);
86 return execATN(input, dfa.s0);
90 void LexerATNSimulator::reset() {
94 _charPositionInLine = 0;
95 _mode = Lexer::DEFAULT_MODE;
98 void LexerATNSimulator::clearDFA() {
99 size_t size = _decisionToDFA.size();
100 _decisionToDFA.clear();
101 for (size_t d = 0; d < size; ++d) {
102 _decisionToDFA.emplace_back(atn.getDecisionState(d), d);
106 size_t LexerATNSimulator::matchATN(CharStream *input) {
107 ATNState *startState = atn.modeToStartState[_mode];
109 std::unique_ptr<ATNConfigSet> s0_closure = computeStartState(input, startState);
111 bool suppressEdge = s0_closure->hasSemanticContext;
112 s0_closure->hasSemanticContext = false;
114 dfa::DFAState *next = addDFAState(s0_closure.release());
116 _decisionToDFA[_mode].s0 = next;
119 size_t predict = execATN(input, next);
124 size_t LexerATNSimulator::execATN(CharStream *input, dfa::DFAState *ds0) {
125 if (ds0->isAcceptState) {
126 // allow zero-length tokens
127 // ml: in Java code this method uses 3 params. The first is a member var of the class anyway (_prevAccept), so why pass it here?
128 captureSimState(input, ds0);
131 size_t t = input->LA(1);
132 dfa::DFAState *s = ds0; // s is current/from DFA state
134 while (true) { // while more work
135 // As we move src->trg, src->trg, we keep track of the previous trg to
136 // avoid looking up the DFA state again, which is expensive.
137 // If the previous target was already part of the DFA, we might
138 // be able to avoid doing a reach operation upon t. If s!=null,
139 // it means that semantic predicates didn't prevent us from
140 // creating a DFA state. Once we know s!=null, we check to see if
141 // the DFA state has an edge already for t. If so, we can just reuse
142 // it's configuration set; there's no point in re-computing it.
143 // This is kind of like doing DFA simulation within the ATN
144 // simulation because DFA simulation is really just a way to avoid
145 // computing reach/closure sets. Technically, once we know that
146 // we have a previously added DFA state, we could jump over to
147 // the DFA simulator. But, that would mean popping back and forth
148 // a lot and making things more complicated algorithmically.
149 // This optimization makes a lot of sense for loops within DFA.
150 // A character will take us back to an existing DFA state
151 // that already has lots of edges out of it. e.g., .* in comments.
152 dfa::DFAState *target = getExistingTargetState(s, t);
153 if (target == nullptr) {
154 target = computeTargetState(input, s, t);
157 if (target == ERROR.get()) {
161 // If this is a consumable input element, make sure to consume before
162 // capturing the accept state so the input index, line, and char
163 // position accurately reflect the state of the interpreter at the
165 if (t != Token::EOF) {
169 if (target->isAcceptState) {
170 captureSimState(input, target);
171 if (t == Token::EOF) {
177 s = target; // flip; current DFA target becomes new src/from state
180 return failOrAccept(input, s->configs.get(), t);
183 dfa::DFAState *LexerATNSimulator::getExistingTargetState(dfa::DFAState *s, size_t t) {
184 dfa::DFAState* retval = nullptr;
185 _edgeLock.readLock();
186 if (t <= MAX_DFA_EDGE) {
187 auto iterator = s->edges.find(t - MIN_DFA_EDGE);
189 if (iterator != s->edges.end()) {
190 std::cout << std::string("reuse state ") << s->stateNumber << std::string(" edge to ") << iterator->second->stateNumber << std::endl;
194 if (iterator != s->edges.end())
195 retval = iterator->second;
197 _edgeLock.readUnlock();
201 dfa::DFAState *LexerATNSimulator::computeTargetState(CharStream *input, dfa::DFAState *s, size_t t) {
202 OrderedATNConfigSet *reach = new OrderedATNConfigSet(); /* mem-check: deleted on error or managed by new DFA state. */
204 // if we don't find an existing DFA state
205 // Fill reach starting from closure, following t transitions
206 getReachableConfigSet(input, s->configs.get(), reach, t);
208 if (reach->isEmpty()) { // we got nowhere on t from s
209 if (!reach->hasSemanticContext) {
210 // we got nowhere on t, don't throw out this knowledge; it'd
211 // cause a failover from DFA later.
213 addDFAEdge(s, t, ERROR.get());
216 // stop when we can't match any more char
220 // Add an edge from s to target DFA found/created for reach
221 return addDFAEdge(s, t, reach);
224 size_t LexerATNSimulator::failOrAccept(CharStream *input, ATNConfigSet *reach, size_t t) {
225 if (_prevAccept.dfaState != nullptr) {
226 Ref<LexerActionExecutor> lexerActionExecutor = _prevAccept.dfaState->lexerActionExecutor;
227 accept(input, lexerActionExecutor, _startIndex, _prevAccept.index, _prevAccept.line, _prevAccept.charPos);
228 return _prevAccept.dfaState->prediction;
230 // if no accept and EOF is first char, return EOF
231 if (t == Token::EOF && input->index() == _startIndex) {
235 throw LexerNoViableAltException(_recog, input, _startIndex, reach);
239 void LexerATNSimulator::getReachableConfigSet(CharStream *input, ATNConfigSet *closure_, ATNConfigSet *reach, size_t t) {
240 // this is used to skip processing for configs which have a lower priority
241 // than a config that already reached an accept state for the same rule
242 size_t skipAlt = ATN::INVALID_ALT_NUMBER;
244 for (auto c : closure_->configs) {
245 bool currentAltReachedAcceptState = c->alt == skipAlt;
246 if (currentAltReachedAcceptState && (std::static_pointer_cast<LexerATNConfig>(c))->hasPassedThroughNonGreedyDecision()) {
251 std::cout << "testing " << getTokenName((int)t) << " at " << c->toString(true) << std::endl;
254 size_t n = c->state->transitions.size();
255 for (size_t ti = 0; ti < n; ti++) { // for each transition
256 Transition *trans = c->state->transitions[ti];
257 ATNState *target = getReachableTarget(trans, (int)t);
258 if (target != nullptr) {
259 Ref<LexerActionExecutor> lexerActionExecutor = std::static_pointer_cast<LexerATNConfig>(c)->getLexerActionExecutor();
260 if (lexerActionExecutor != nullptr) {
261 lexerActionExecutor = lexerActionExecutor->fixOffsetBeforeMatch((int)input->index() - (int)_startIndex);
264 bool treatEofAsEpsilon = t == Token::EOF;
265 Ref<LexerATNConfig> config = std::make_shared<LexerATNConfig>(std::static_pointer_cast<LexerATNConfig>(c),
266 target, lexerActionExecutor);
268 if (closure(input, config, reach, currentAltReachedAcceptState, true, treatEofAsEpsilon)) {
269 // any remaining configs for this alt have a lower priority than
270 // the one that just reached an accept state.
279 void LexerATNSimulator::accept(CharStream *input, const Ref<LexerActionExecutor> &lexerActionExecutor, size_t /*startIndex*/,
280 size_t index, size_t line, size_t charPos) {
282 std::cout << "ACTION ";
283 std::cout << toString(lexerActionExecutor) << std::endl;
286 // seek to after last char in token
289 _charPositionInLine = (int)charPos;
291 if (lexerActionExecutor != nullptr && _recog != nullptr) {
292 lexerActionExecutor->execute(_recog, input, _startIndex);
296 atn::ATNState *LexerATNSimulator::getReachableTarget(Transition *trans, size_t t) {
297 if (trans->matches(t, Lexer::MIN_CHAR_VALUE, Lexer::MAX_CHAR_VALUE)) {
298 return trans->target;
304 std::unique_ptr<ATNConfigSet> LexerATNSimulator::computeStartState(CharStream *input, ATNState *p) {
305 Ref<PredictionContext> initialContext = PredictionContext::EMPTY; // ml: the purpose of this assignment is unclear
306 std::unique_ptr<ATNConfigSet> configs(new OrderedATNConfigSet());
307 for (size_t i = 0; i < p->transitions.size(); i++) {
308 ATNState *target = p->transitions[i]->target;
309 Ref<LexerATNConfig> c = std::make_shared<LexerATNConfig>(target, (int)(i + 1), initialContext);
310 closure(input, c, configs.get(), false, false, false);
316 bool LexerATNSimulator::closure(CharStream *input, const Ref<LexerATNConfig> &config, ATNConfigSet *configs,
317 bool currentAltReachedAcceptState, bool speculative, bool treatEofAsEpsilon) {
319 std::cout << "closure(" << config->toString(true) << ")" << std::endl;
322 if (is<RuleStopState *>(config->state)) {
324 if (_recog != nullptr) {
325 std::cout << "closure at " << _recog->getRuleNames()[config->state->ruleIndex] << " rule stop " << config << std::endl;
327 std::cout << "closure at rule stop " << config << std::endl;
331 if (config->context == nullptr || config->context->hasEmptyPath()) {
332 if (config->context == nullptr || config->context->isEmpty()) {
333 configs->add(config);
336 configs->add(std::make_shared<LexerATNConfig>(config, config->state, PredictionContext::EMPTY));
337 currentAltReachedAcceptState = true;
341 if (config->context != nullptr && !config->context->isEmpty()) {
342 for (size_t i = 0; i < config->context->size(); i++) {
343 if (config->context->getReturnState(i) != PredictionContext::EMPTY_RETURN_STATE) {
344 std::weak_ptr<PredictionContext> newContext = config->context->getParent(i); // "pop" return state
345 ATNState *returnState = atn.states[config->context->getReturnState(i)];
346 Ref<LexerATNConfig> c = std::make_shared<LexerATNConfig>(config, returnState, newContext.lock());
347 currentAltReachedAcceptState = closure(input, c, configs, currentAltReachedAcceptState, speculative, treatEofAsEpsilon);
352 return currentAltReachedAcceptState;
356 if (!config->state->epsilonOnlyTransitions) {
357 if (!currentAltReachedAcceptState || !config->hasPassedThroughNonGreedyDecision()) {
358 configs->add(config);
362 ATNState *p = config->state;
363 for (size_t i = 0; i < p->transitions.size(); i++) {
364 Transition *t = p->transitions[i];
365 Ref<LexerATNConfig> c = getEpsilonTarget(input, config, t, configs, speculative, treatEofAsEpsilon);
367 currentAltReachedAcceptState = closure(input, c, configs, currentAltReachedAcceptState, speculative, treatEofAsEpsilon);
371 return currentAltReachedAcceptState;
374 Ref<LexerATNConfig> LexerATNSimulator::getEpsilonTarget(CharStream *input, const Ref<LexerATNConfig> &config, Transition *t,
375 ATNConfigSet *configs, bool speculative, bool treatEofAsEpsilon) {
377 Ref<LexerATNConfig> c = nullptr;
378 switch (t->getSerializationType()) {
379 case Transition::RULE: {
380 RuleTransition *ruleTransition = static_cast<RuleTransition*>(t);
381 Ref<PredictionContext> newContext = SingletonPredictionContext::create(config->context, ruleTransition->followState->stateNumber);
382 c = std::make_shared<LexerATNConfig>(config, t->target, newContext);
386 case Transition::PRECEDENCE:
387 throw UnsupportedOperationException("Precedence predicates are not supported in lexers.");
389 case Transition::PREDICATE: {
390 /* Track traversing semantic predicates. If we traverse,
391 we cannot add a DFA state for this "reach" computation
392 because the DFA would not test the predicate again in the
393 future. Rather than creating collections of semantic predicates
394 like v3 and testing them on prediction, v4 will test them on the
395 fly all the time using the ATN not the DFA. This is slower but
396 semantically it's not used that often. One of the key elements to
397 this predicate mechanism is not adding DFA states that see
398 predicates immediately afterwards in the ATN. For example,
400 a : ID {p1}? | ID {p2}? ;
402 should create the start state for rule 'a' (to save start state
403 competition), but should not create target of ID state. The
404 collection of ATN states the following ID references includes
405 states reached by traversing predicates. Since this is when we
406 test them, we cannot cash the DFA state target of ID.
408 PredicateTransition *pt = static_cast<PredicateTransition*>(t);
411 std::cout << "EVAL rule " << pt->ruleIndex << ":" << pt->predIndex << std::endl;
414 configs->hasSemanticContext = true;
415 if (evaluatePredicate(input, pt->ruleIndex, pt->predIndex, speculative)) {
416 c = std::make_shared<LexerATNConfig>(config, t->target);
421 case Transition::ACTION:
422 if (config->context == nullptr|| config->context->hasEmptyPath()) {
423 // execute actions anywhere in the start rule for a token.
425 // TODO: if the entry rule is invoked recursively, some
426 // actions may be executed during the recursive call. The
427 // problem can appear when hasEmptyPath() is true but
428 // isEmpty() is false. In this case, the config needs to be
429 // split into two contexts - one with just the empty path
430 // and another with everything but the empty path.
431 // Unfortunately, the current algorithm does not allow
432 // getEpsilonTarget to return two configurations, so
433 // additional modifications are needed before we can support
434 // the split operation.
435 Ref<LexerActionExecutor> lexerActionExecutor = LexerActionExecutor::append(config->getLexerActionExecutor(),
436 atn.lexerActions[static_cast<ActionTransition *>(t)->actionIndex]);
437 c = std::make_shared<LexerATNConfig>(config, t->target, lexerActionExecutor);
441 // ignore actions in referenced rules
442 c = std::make_shared<LexerATNConfig>(config, t->target);
446 case Transition::EPSILON:
447 c = std::make_shared<LexerATNConfig>(config, t->target);
450 case Transition::ATOM:
451 case Transition::RANGE:
452 case Transition::SET:
453 if (treatEofAsEpsilon) {
454 if (t->matches(Token::EOF, Lexer::MIN_CHAR_VALUE, Lexer::MAX_CHAR_VALUE)) {
455 c = std::make_shared<LexerATNConfig>(config, t->target);
462 default: // To silence the compiler. Other transition types are not used here.
469 bool LexerATNSimulator::evaluatePredicate(CharStream *input, size_t ruleIndex, size_t predIndex, bool speculative) {
470 // assume true if no recognizer was provided
471 if (_recog == nullptr) {
476 return _recog->sempred(nullptr, ruleIndex, predIndex);
479 size_t savedCharPositionInLine = _charPositionInLine;
480 size_t savedLine = _line;
481 size_t index = input->index();
482 ssize_t marker = input->mark();
484 auto onExit = finally([this, input, savedCharPositionInLine, savedLine, index, marker] {
485 _charPositionInLine = savedCharPositionInLine;
488 input->release(marker);
492 return _recog->sempred(nullptr, ruleIndex, predIndex);
495 void LexerATNSimulator::captureSimState(CharStream *input, dfa::DFAState *dfaState) {
496 _prevAccept.index = input->index();
497 _prevAccept.line = _line;
498 _prevAccept.charPos = _charPositionInLine;
499 _prevAccept.dfaState = dfaState;
502 dfa::DFAState *LexerATNSimulator::addDFAEdge(dfa::DFAState *from, size_t t, ATNConfigSet *q) {
503 /* leading to this call, ATNConfigSet.hasSemanticContext is used as a
504 * marker indicating dynamic predicate evaluation makes this edge
505 * dependent on the specific input sequence, so the static edge in the
506 * DFA should be omitted. The target DFAState is still created since
507 * execATN has the ability to resynchronize with the DFA state cache
508 * following the predicate evaluation step.
510 * TJP notes: next time through the DFA, we see a pred again and eval.
511 * If that gets us to a previously created (but dangling) DFA
512 * state, we can continue in pure DFA mode from there.
514 bool suppressEdge = q->hasSemanticContext;
515 q->hasSemanticContext = false;
517 dfa::DFAState *to = addDFAState(q);
523 addDFAEdge(from, t, to);
527 void LexerATNSimulator::addDFAEdge(dfa::DFAState *p, size_t t, dfa::DFAState *q) {
528 if (/*t < MIN_DFA_EDGE ||*/ t > MAX_DFA_EDGE) { // MIN_DFA_EDGE is 0
529 // Only track edges within the DFA bounds
533 _edgeLock.writeLock();
534 p->edges[t - MIN_DFA_EDGE] = q; // connect
535 _edgeLock.writeUnlock();
538 dfa::DFAState *LexerATNSimulator::addDFAState(ATNConfigSet *configs) {
539 /* the lexer evaluates predicates on-the-fly; by this point configs
540 * should not contain any configurations with unevaluated predicates.
542 assert(!configs->hasSemanticContext);
544 dfa::DFAState *proposed = new dfa::DFAState(std::unique_ptr<ATNConfigSet>(configs)); /* mem-check: managed by the DFA or deleted below */
545 Ref<ATNConfig> firstConfigWithRuleStopState = nullptr;
546 for (auto &c : configs->configs) {
547 if (is<RuleStopState *>(c->state)) {
548 firstConfigWithRuleStopState = c;
553 if (firstConfigWithRuleStopState != nullptr) {
554 proposed->isAcceptState = true;
555 proposed->lexerActionExecutor = std::dynamic_pointer_cast<LexerATNConfig>(firstConfigWithRuleStopState)->getLexerActionExecutor();
556 proposed->prediction = atn.ruleToTokenType[firstConfigWithRuleStopState->state->ruleIndex];
559 dfa::DFA &dfa = _decisionToDFA[_mode];
561 _stateLock.writeLock();
562 if (!dfa.states.empty()) {
563 auto iterator = dfa.states.find(proposed);
564 if (iterator != dfa.states.end()) {
566 _stateLock.writeUnlock();
571 proposed->stateNumber = (int)dfa.states.size();
572 proposed->configs->setReadonly(true);
574 dfa.states.insert(proposed);
575 _stateLock.writeUnlock();
580 dfa::DFA& LexerATNSimulator::getDFA(size_t mode) {
581 return _decisionToDFA[mode];
584 std::string LexerATNSimulator::getText(CharStream *input) {
585 // index is first lookahead char, don't include.
586 return input->getText(misc::Interval(_startIndex, input->index() - 1));
589 size_t LexerATNSimulator::getLine() const {
593 void LexerATNSimulator::setLine(size_t line) {
597 size_t LexerATNSimulator::getCharPositionInLine() {
598 return _charPositionInLine;
601 void LexerATNSimulator::setCharPositionInLine(size_t charPositionInLine) {
602 _charPositionInLine = charPositionInLine;
605 void LexerATNSimulator::consume(CharStream *input) {
606 size_t curChar = input->LA(1);
607 if (curChar == '\n') {
609 _charPositionInLine = 0;
611 _charPositionInLine++;
616 std::string LexerATNSimulator::getTokenName(size_t t) {
617 if (t == Token::EOF) {
620 return std::string("'") + static_cast<char>(t) + std::string("'");
623 void LexerATNSimulator::InitializeInstanceFields() {
626 _charPositionInLine = 0;
627 _mode = antlr4::Lexer::DEFAULT_MODE;