summaryrefslogtreecommitdiff
path: root/deps/v8/src/torque/earley-parser.h
blob: 1e77734ab66e182a95ea12c28fbd131b82f915da (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_TORQUE_EARLEY_PARSER_H_
#define V8_TORQUE_EARLEY_PARSER_H_

#include <map>
#include <vector>

#include "src/base/optional.h"
#include "src/torque/contextual.h"
#include "src/torque/source-positions.h"
#include "src/torque/utils.h"

namespace v8 {
namespace internal {
namespace torque {

class Symbol;
class Item;

class ParseResultHolderBase {
 public:
  enum class TypeId;
  virtual ~ParseResultHolderBase() = default;
  template <class T>
  T& Cast();
  template <class T>
  const T& Cast() const;

 protected:
  explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) {
    // MSVC wrongly complains about type_id_ being an unused private field.
    USE(type_id_);
  }

 private:
  const TypeId type_id_;
};

using ParseResultTypeId = ParseResultHolderBase::TypeId;

template <class T>
class ParseResultHolder : public ParseResultHolderBase {
 public:
  explicit ParseResultHolder(T value)
      : ParseResultHolderBase(id), value_(std::move(value)) {}

 private:
  V8_EXPORT_PRIVATE static const TypeId id;
  friend class ParseResultHolderBase;
  T value_;
};

template <class T>
T& ParseResultHolderBase::Cast() {
  CHECK_EQ(ParseResultHolder<T>::id, type_id_);
  return static_cast<ParseResultHolder<T>*>(this)->value_;
}

template <class T>
const T& ParseResultHolderBase::Cast() const {
  CHECK_EQ(ParseResultHolder<T>::id, type_id_);
  return static_cast<const ParseResultHolder<T>*>(this)->value_;
}

class ParseResult {
 public:
  template <class T>
  explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {}

  template <class T>
  const T& Cast() const {
    return value_->Cast<T>();
  }
  template <class T>
  T& Cast() {
    return value_->Cast<T>();
  }

 private:
  std::unique_ptr<ParseResultHolderBase> value_;
};

using InputPosition = const char*;

struct MatchedInput {
  MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos)
      : begin(begin), end(end), pos(pos) {}
  InputPosition begin;
  InputPosition end;
  SourcePosition pos;
  std::string ToString() const { return {begin, end}; }
};

class ParseResultIterator {
 public:
  explicit ParseResultIterator(std::vector<ParseResult> results,
                               MatchedInput matched_input)
      : results_(std::move(results)), matched_input_(matched_input) {}
  ~ParseResultIterator() {
    // Check that all parse results have been used.
    CHECK_EQ(results_.size(), i_);
  }

  ParseResult Next() {
    CHECK_LT(i_, results_.size());
    return std::move(results_[i_++]);
  }
  template <class T>
  T NextAs() {
    return std::move(Next().Cast<T>());
  }
  bool HasNext() const { return i_ < results_.size(); }

  const MatchedInput& matched_input() const { return matched_input_; }

 private:
  std::vector<ParseResult> results_;
  size_t i_ = 0;
  MatchedInput matched_input_;

  DISALLOW_COPY_AND_MOVE_AND_ASSIGN(ParseResultIterator);
};

struct LexerResult {
  std::vector<Symbol*> token_symbols;
  std::vector<MatchedInput> token_contents;
};

using Action =
    base::Optional<ParseResult> (*)(ParseResultIterator* child_results);

inline base::Optional<ParseResult> DefaultAction(
    ParseResultIterator* child_results) {
  if (!child_results->HasNext()) return base::nullopt;
  return child_results->Next();
}

// A rule of the context-free grammar. Each rule can have an action attached to
// it, which is executed after the parsing is finished.
class Rule final {
 public:
  explicit Rule(std::vector<Symbol*> right_hand_side,
                Action action = DefaultAction)
      : right_hand_side_(std::move(right_hand_side)), action_(action) {}

  Symbol* left() const {
    DCHECK_NOT_NULL(left_hand_side_);
    return left_hand_side_;
  }
  const std::vector<Symbol*>& right() const { return right_hand_side_; }

  void SetLeftHandSide(Symbol* left_hand_side) {
    DCHECK_NULL(left_hand_side_);
    left_hand_side_ = left_hand_side;
  }

  V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
      const Item* completed_item, const LexerResult& tokens) const;

 private:
  Symbol* left_hand_side_ = nullptr;
  std::vector<Symbol*> right_hand_side_;
  Action action_;
};

// A Symbol represents a terminal or a non-terminal of the grammar.
// It stores the list of rules, which have this symbol as the
// left-hand side.
// Terminals have an empty list of rules, they are created by the Lexer
// instead of from rules.
// Symbols need to reside at stable memory addresses, because the addresses are
// used in the parser.
class Symbol {
 public:
  Symbol() : Symbol({}) {}
  Symbol(std::initializer_list<Rule> rules) { *this = rules; }

  V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules);

  bool IsTerminal() const { return rules_.empty(); }
  Rule* rule(size_t index) const { return rules_[index].get(); }
  size_t rule_number() const { return rules_.size(); }

  void AddRule(const Rule& rule) {
    rules_.push_back(base::make_unique<Rule>(rule));
    rules_.back()->SetLeftHandSide(this);
  }

  V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
      const Item* item, const LexerResult& tokens);

 private:
  std::vector<std::unique_ptr<Rule>> rules_;

  // Disallow copying and moving to ensure Symbol has a stable address.
  DISALLOW_COPY_AND_MOVE_AND_ASSIGN(Symbol);
};

// Items are the core datastructure of Earley's algorithm.
// They consist of a (partially) matched rule, a marked position inside of the
// right-hand side of the rule (traditionally written as a dot) and an input
// range from {start} to {pos} that matches the symbols of the right-hand side
// that are left of the mark. In addition, they store a child and a left-sibling
// pointer to reconstruct the AST in the end.
class Item {
 public:
  Item(const Rule* rule, size_t mark, size_t start, size_t pos)
      : rule_(rule), mark_(mark), start_(start), pos_(pos) {
    DCHECK_LE(mark_, right().size());
  }

  // A complete item has the mark at the right end, which means the input range
  // matches the complete rule.
  bool IsComplete() const {
    DCHECK_LE(mark_, right().size());
    return mark_ == right().size();
  }

  // The symbol right after the mark is expected at {pos} for this item to
  // advance.
  Symbol* NextSymbol() const {
    DCHECK(!IsComplete());
    DCHECK_LT(mark_, right().size());
    return right()[mark_];
  }

  // We successfully parsed NextSymbol() between {pos} and {new_pos}.
  // If NextSymbol() was a non-terminal, then {child} is a pointer to a
  // completed item for this parse.
  // We create a new item, which moves the mark one forward.
  Item Advance(size_t new_pos, const Item* child = nullptr) const {
    if (child) {
      DCHECK(child->IsComplete());
      DCHECK_EQ(pos(), child->start());
      DCHECK_EQ(new_pos, child->pos());
      DCHECK_EQ(NextSymbol(), child->left());
    }
    Item result(rule_, mark_ + 1, start_, new_pos);
    result.prev_ = this;
    result.child_ = child;
    return result;
  }

  // Collect the items representing the AST children of this completed item.
  std::vector<const Item*> Children() const;
  // The matched input separated according to the next branching AST level.
  std::string SplitByChildren(const LexerResult& tokens) const;
  // Check if {other} results in the same AST as this Item.
  void CheckAmbiguity(const Item& other, const LexerResult& tokens) const;

  MatchedInput GetMatchedInput(const LexerResult& tokens) const {
    return {tokens.token_contents[start_].begin,
            start_ == pos_ ? tokens.token_contents[start_].begin
                           : tokens.token_contents[pos_ - 1].end,
            tokens.token_contents[start_].pos};
  }

  // We exclude {prev_} and {child_} from equality and hash computations,
  // because they are just globally unique data associated with an item.
  bool operator==(const Item& other) const {
    return rule_ == other.rule_ && mark_ == other.mark_ &&
           start_ == other.start_ && pos_ == other.pos_;
  }

  friend size_t hash_value(const Item& i) {
    return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_);
  }

  const Rule* rule() const { return rule_; }
  Symbol* left() const { return rule_->left(); }
  const std::vector<Symbol*>& right() const { return rule_->right(); }
  size_t pos() const { return pos_; }
  size_t start() const { return start_; }

 private:
  const Rule* rule_;
  size_t mark_;
  size_t start_;
  size_t pos_;

  const Item* prev_ = nullptr;
  const Item* child_ = nullptr;
};

inline base::Optional<ParseResult> Symbol::RunAction(
    const Item* item, const LexerResult& tokens) {
  DCHECK(item->IsComplete());
  DCHECK_EQ(item->left(), this);
  return item->rule()->RunAction(item, tokens);
}

V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm(
    Symbol* start, const LexerResult& tokens,
    std::unordered_set<Item, base::hash<Item>>* processed);

inline base::Optional<ParseResult> ParseTokens(Symbol* start,
                                               const LexerResult& tokens) {
  std::unordered_set<Item, base::hash<Item>> table;
  const Item* final_item = RunEarleyAlgorithm(start, tokens, &table);
  return start->RunAction(final_item, tokens);
}

// The lexical syntax is dynamically defined while building the grammar by
// adding patterns and keywords to the Lexer.
// The term keyword here can stand for any fixed character sequence, including
// operators and parentheses.
// Each pattern or keyword automatically gets a terminal symbol associated with
// it. These symbols form the result of the lexing.
// Patterns and keywords are matched using the longest match principle. If the
// longest matching pattern coincides with a keyword, the keyword symbol is
// chosen instead of the pattern.
// In addition, there is a single whitespace pattern which is consumed but does
// not become part of the token list.
class Lexer {
 public:
  // Functions to define patterns. They try to match starting from {pos}. If
  // successful, they return true and advance {pos}. Otherwise, {pos} stays
  // unchanged.
  using PatternFunction = bool (*)(InputPosition* pos);

  void SetWhitespace(PatternFunction whitespace) {
    match_whitespace_ = whitespace;
  }

  Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; }
  Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; }
  V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input);

 private:
  PatternFunction match_whitespace_ = [](InputPosition*) { return false; };
  std::map<PatternFunction, Symbol> patterns_;
  std::map<std::string, Symbol> keywords_;
  Symbol* MatchToken(InputPosition* pos, InputPosition end);
};

// A grammar can have a result, which is the results of the start symbol.
// Grammar is intended to be subclassed, with Symbol members forming the
// mutually recursive rules of the grammar.
class Grammar {
 public:
  using PatternFunction = Lexer::PatternFunction;

  explicit Grammar(Symbol* start) : start_(start) {}

  base::Optional<ParseResult> Parse(const std::string& input) {
    LexerResult tokens = lexer().RunLexer(input);
    return ParseTokens(start_, tokens);
  }

 protected:
  Symbol* Token(const std::string& s) { return lexer_.Token(s); }
  Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); }
  void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); }

  // NewSymbol() allocates a fresh symbol and stores it in the current grammar.
  // This is necessary to define helpers that create new symbols.
  Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) {
    Symbol* result = new Symbol(rules);
    generated_symbols_.push_back(std::unique_ptr<Symbol>(result));
    return result;
  }

  // Helper functions to define lexer patterns. If they match, they return true
  // and advance {pos}. Otherwise, {pos} is unchanged.
  V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int),
                                          InputPosition* pos);
  V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char),
                                          InputPosition* pos);
  V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos);
  V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos);

  // The action MatchInput() produces the input matched by the rule as
  // result.
  static base::Optional<ParseResult> YieldMatchedInput(
      ParseResultIterator* child_results) {
    return ParseResult{child_results->matched_input().ToString()};
  }

  // Create a new symbol to parse the given sequence of symbols.
  // At most one of the symbols can return a result.
  Symbol* Sequence(std::vector<Symbol*> symbols) {
    return NewSymbol({Rule(std::move(symbols))});
  }

  template <class T, T value>
  static base::Optional<ParseResult> YieldIntegralConstant(
      ParseResultIterator* child_results) {
    return ParseResult{value};
  }

  template <class T>
  static base::Optional<ParseResult> YieldDefaultValue(
      ParseResultIterator* child_results) {
    return ParseResult{T{}};
  }

  template <class From, class To>
  static base::Optional<ParseResult> CastParseResult(
      ParseResultIterator* child_results) {
    To result = std::move(child_results->NextAs<From>());
    return ParseResult{std::move(result)};
  }

  // Try to parse {s} and return the result of type {Result} casted to {T}.
  // Otherwise, the result is a default-constructed {T}.
  template <class T, class Result = T>
  Symbol* TryOrDefault(Symbol* s) {
    return NewSymbol({Rule({s}, CastParseResult<Result, T>),
                      Rule({}, YieldDefaultValue<T>)});
  }

  template <class T>
  static base::Optional<ParseResult> MakeSingletonVector(
      ParseResultIterator* child_results) {
    T x = child_results->NextAs<T>();
    std::vector<T> result;
    result.push_back(std::move(x));
    return ParseResult{std::move(result)};
  }

  template <class T>
  static base::Optional<ParseResult> MakeExtendedVector(
      ParseResultIterator* child_results) {
    std::vector<T> l = child_results->NextAs<std::vector<T>>();
    T x = child_results->NextAs<T>();
    l.push_back(std::move(x));
    return ParseResult{std::move(l)};
  }

  // For example, NonemptyList(Token("A"), Token(",")) parses any of
  // A or A,A or A,A,A and so on.
  template <class T>
  Symbol* NonemptyList(Symbol* element,
                       base::Optional<Symbol*> separator = {}) {
    Symbol* list = NewSymbol();
    *list = {Rule({element}, MakeSingletonVector<T>),
             separator
                 ? Rule({list, *separator, element}, MakeExtendedVector<T>)
                 : Rule({list, element}, MakeExtendedVector<T>)};
    return list;
  }

  template <class T>
  Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) {
    return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator));
  }

  template <class T>
  Symbol* Optional(Symbol* x) {
    return TryOrDefault<base::Optional<T>, T>(x);
  }

  Symbol* CheckIf(Symbol* x) {
    return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>),
                      Rule({}, YieldIntegralConstant<bool, false>)});
  }

  Lexer& lexer() { return lexer_; }

 private:
  Lexer lexer_;
  std::vector<std::unique_ptr<Symbol>> generated_symbols_;
  Symbol* start_;
};

}  // namespace torque
}  // namespace internal
}  // namespace v8

#endif  // V8_TORQUE_EARLEY_PARSER_H_