OmniSciDB  a5dc49c757
TableFunctionsFactory_parser.py
__all__ = ['Parser']

import sys
import TableFunctionsFactory_node as tf_node
from collections import deque

if sys.version_info > (3, 0):
    from collections.abc import Iterable
else:
    from collections import Iterable


class TokenizeException(Exception):
    pass


class ParserException(Exception):
    pass


class Token:
    LESS = 1         # <
    GREATER = 2      # >
    COMMA = 3        # ,
    EQUAL = 4        # =
    RARROW = 5       # ->
    STRING = 6       # reserved for string constants
    NUMBER = 7
    VBAR = 8         # |
    BANG = 9         # !
    LPAR = 10        # (
    RPAR = 11        # )
    LSQB = 12        # [
    RSQB = 13        # ]
    IDENTIFIER = 14
    COLON = 15       # :
    BOOLEAN = 16

    def __init__(self, type, lexeme):
        """
        Parameters
        ----------
        type : int
            One of the tokens in the list above
        lexeme : str
            Corresponding string in the text
        """
        self.type = type
        self.lexeme = lexeme

    @classmethod
    def tok_name(cls, token):
        names = {
            Token.LESS: "LESS",
            Token.GREATER: "GREATER",
            Token.COMMA: "COMMA",
            Token.EQUAL: "EQUAL",
            Token.RARROW: "RARROW",
            Token.STRING: "STRING",
            Token.NUMBER: "NUMBER",
            Token.VBAR: "VBAR",
            Token.BANG: "BANG",
            Token.LPAR: "LPAR",
            Token.RPAR: "RPAR",
            Token.LSQB: "LSQB",
            Token.RSQB: "RSQB",
            Token.IDENTIFIER: "IDENTIFIER",
            Token.COLON: "COLON",
            Token.BOOLEAN: "BOOLEAN",
        }
        return names.get(token)

    def __str__(self):
        return 'Token(%s, "%s")' % (Token.tok_name(self.type), self.lexeme)

    __repr__ = __str__


class Tokenize:
    def __init__(self, line):
        self._line = line
        self._tokens = []
        self.start = 0
        self.curr = 0
        self.tokenize()

    @property
    def line(self):
        return self._line

    @property
    def tokens(self):
        return self._tokens

    def tokenize(self):
        # Dispatch on the class of the current character until the whole
        # line has been consumed.
        while not self.is_at_end():
            self.start = self.curr

            if self.is_token_whitespace():
                self.consume_whitespace()
            elif self.is_number():
                self.consume_number()
            elif self.is_token_string():
                self.consume_string()
            elif self.is_token_identifier_or_boolean():
                self.consume_identifier_or_boolean()
            elif self.can_token_be_double_char():
                self.consume_double_char()
            else:
                self.consume_single_char()

    def is_at_end(self):
        return len(self.line) == self.curr

    def current_token(self):
        return self.line[self.start:self.curr + 1]

    def add_token(self, type):
        lexeme = self.line[self.start:self.curr + 1]
        self._tokens.append(Token(type, lexeme))

    def lookahead(self):
        if self.curr + 1 >= len(self.line):
            return None
        return self.line[self.curr + 1]

    def advance(self):
        self.curr += 1

    def peek(self):
        return self.line[self.curr]

    def can_token_be_double_char(self):
        char = self.peek()
        return char in ("-",)

    def consume_double_char(self):
        ahead = self.lookahead()
        if ahead == ">":
            self.advance()
            self.add_token(Token.RARROW)  # ->
            self.advance()
        else:
            self.raise_tokenize_error()

    def consume_single_char(self):
        char = self.peek()
        if char == "(":
            self.add_token(Token.LPAR)
        elif char == ")":
            self.add_token(Token.RPAR)
        elif char == "<":
            self.add_token(Token.LESS)
        elif char == ">":
            self.add_token(Token.GREATER)
        elif char == ",":
            self.add_token(Token.COMMA)
        elif char == "=":
            self.add_token(Token.EQUAL)
        elif char == "|":
            self.add_token(Token.VBAR)
        elif char == "!":
            self.add_token(Token.BANG)
        elif char == "[":
            self.add_token(Token.LSQB)
        elif char == "]":
            self.add_token(Token.RSQB)
        elif char == ":":
            self.add_token(Token.COLON)
        else:
            self.raise_tokenize_error()
        self.advance()

    def consume_whitespace(self):
        self.advance()

    def consume_string(self):
        """
        STRING: \".*?\"
        """
        while True:
            char = self.lookahead()
            curr = self.peek()
            if char == '"' and curr != '\\':
                self.advance()
                break
            self.advance()
        self.add_token(Token.STRING)
        self.advance()

    def consume_number(self):
        """
        NUMBER: [-]([0-9]*[.])?[0-9]+
        """
        found_dot = False
        while True:
            char = self.lookahead()
            if char:
                if char.isdigit():
                    self.advance()
                elif char == "." and not found_dot:
                    found_dot = True
                    self.advance()
                else:
                    break
            else:
                break
        self.add_token(Token.NUMBER)
        self.advance()

    def consume_identifier_or_boolean(self):
        """
        IDENTIFIER: [A-Za-z_][A-Za-z0-9_]*
        """
        while True:
            char = self.lookahead()
            if char and (char.isalnum() or char == "_"):
                self.advance()
            else:
                break
        if self.current_token().lower() in ("true", "false"):
            self.add_token(Token.BOOLEAN)
        else:
            self.add_token(Token.IDENTIFIER)
        self.advance()

    def is_token_identifier_or_boolean(self):
        return self.peek().isalpha() or self.peek() == "_"

    def is_token_string(self):
        return self.peek() == '"'

    def is_number(self):
        # A trailing "-" has no lookahead; guard against calling
        # .isdigit() on None.
        ahead = self.lookahead()
        return self.peek().isdigit() or (self.peek() == '-'
                                         and ahead is not None
                                         and ahead.isdigit())

    def is_alpha(self):
        return self.peek().isalpha()

    def is_token_whitespace(self):
        return self.peek().isspace()

    def raise_tokenize_error(self):
        curr = self.curr
        char = self.peek()
        raise TokenizeException(
            'Could not match char "%s" at pos %d on line\n %s' % (char, curr, self.line)
        )

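# Illustrative example (not part of the original module): tokenizing a small
# signature shows how characters are grouped into tokens.
#
#   Tokenize('f(int32 x) -> int32').tokens
#   -> [Token(IDENTIFIER, "f"), Token(LPAR, "("), Token(IDENTIFIER, "int32"),
#       Token(IDENTIFIER, "x"), Token(RPAR, ")"), Token(RARROW, "->"),
#       Token(IDENTIFIER, "int32")]
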
def is_identifier_cursor(identifier):
    return identifier.lower() == 'cursor'


class Parser:
    def __init__(self, line):
        self._tokens = Tokenize(line).tokens
        self._curr = 0
        self.line = line

    @property
    def tokens(self):
        return self._tokens

    def is_at_end(self):
        return self._curr >= len(self._tokens)

    def current_token(self):
        return self._tokens[self._curr]

    def advance(self):
        self._curr += 1

    def expect(self, expected_type):
        curr_token = self.current_token()
        msg = "Expected token %s but got %s at pos %d.\n Tokens: %s" % (
            Token.tok_name(expected_type),
            curr_token,
            self._curr,
            self._tokens,
        )
        assert curr_token.type == expected_type, msg
        self.advance()

    def consume(self, expected_type):
        """Consumes the current token iff its type matches the
        expected_type. Otherwise, an error is raised.
        """
        curr_token = self.current_token()
        if curr_token.type == expected_type:
            self.advance()
            return curr_token
        else:
            expected_token = Token.tok_name(expected_type)
            self.raise_parser_error(
                'Token mismatch at function consume. '
                'Expected type "%s" but got token "%s"\n\n'
                'Tokens: %s\n' % (expected_token, curr_token, self._tokens)
            )

    def current_pos(self):
        return self._curr

    def raise_parser_error(self, msg=None):
        if not msg:
            token = self.current_token()
            pos = self.current_pos()
            tokens = self.tokens
            msg = "\n\nError while trying to parse token %s at pos %d.\n" \
                  "Tokens: %s" % (token, pos, tokens)
        raise ParserException(msg)

    def match(self, expected_type):
        curr_token = self.current_token()
        return curr_token.type == expected_type

    def lookahead(self):
        return self._tokens[self._curr + 1]

    def parse_udtf(self):
        """fmt: off

        udtf: IDENTIFIER "(" (args)? ")" ("|" annotation)* "->" args ("," templates)? ("|" "output_row_size" "=" primitive)?

        fmt: on
        """
        name = self.parse_identifier()
        self.expect(Token.LPAR)  # (
        input_args = []
        if not self.match(Token.RPAR):
            input_args = self.parse_args()
        self.expect(Token.RPAR)  # )
        annotations = []
        while not self.is_at_end() and self.match(Token.VBAR):  # |
            self.consume(Token.VBAR)
            annotations.append(self.parse_annotation())
        self.expect(Token.RARROW)  # ->
        output_args = self.parse_args()

        templates = None
        if not self.is_at_end() and self.match(Token.COMMA):
            self.consume(Token.COMMA)
            templates = self.parse_templates()

        sizer = None
        if not self.is_at_end() and self.match(Token.VBAR):
            self.consume(Token.VBAR)
            idtn = self.parse_identifier()
            assert idtn == "output_row_size", idtn
            self.consume(Token.EQUAL)
            node = self.parse_primitive()
            key = "kPreFlightParameter"
            sizer = tf_node.AnnotationNode(key, value=node.type)

        # set arg_pos
        i = 0
        for arg in input_args:
            arg.arg_pos = i
            arg.kind = "input"
            i += arg.type.cursor_length() if arg.type.is_cursor() else 1

        for i, arg in enumerate(output_args):
            arg.arg_pos = i
            arg.kind = "output"

        return tf_node.UdtfNode(name, input_args, output_args, annotations, templates, sizer, self.line)

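    # Illustrative example (hypothetical signature, not from the original
    # file): parsing
    #
    #   my_udtf(Cursor<int32 x>, int32 m) -> Column<int32> out | output_row_size=m
    #
    # yields a tf_node.UdtfNode whose input_args hold the Cursor and the
    # scalar m (kind="input"), whose output_args hold the named column out
    # (kind="output"), and whose sizer is an AnnotationNode keyed
    # "kPreFlightParameter" holding the parsed primitive (here, m).
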
    def parse_args(self):
        """fmt: off

        args: arg ("," arg)*

        fmt: on
        """
        args = []
        args.append(self.parse_arg())
        while not self.is_at_end() and self.match(Token.COMMA):
            curr = self._curr
            self.consume(Token.COMMA)
            self.parse_type()  # assuming that we are not ending with COMMA
            if not self.is_at_end() and self.match(Token.EQUAL):
                # arg type cannot be assigned, so this must be a template
                # specification
                self._curr = curr  # step back and let the caller parse the templates
                break
            else:
                self._curr = curr + 1  # step back from self.parse_type(); parse_arg will parse it again
                args.append(self.parse_arg())
        return args

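    # Why parse_args backtracks (see the COMMA handling above): in a tail
    # such as
    #
    #   ... -> int32 out, T=[int32, int64]
    #
    # the comma after "out" could begin either another output argument or a
    # template list. The parser tentatively reads a type; an "=" right after
    # it proves the name was a template key (types cannot be assigned), so
    # it rewinds and lets parse_udtf consume the templates instead.
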
    def parse_arg(self):
        """fmt: off

        arg: type IDENTIFIER? ("|" annotation)*

        fmt: on
        """
        typ = self.parse_type()

        annotations = []

        if not self.is_at_end() and self.match(Token.IDENTIFIER):
            name = self.parse_identifier()
            annotations.append(tf_node.AnnotationNode('name', name))

        while not self.is_at_end() and self.match(Token.VBAR):
            ahead = self.lookahead()
            if ahead.type == Token.IDENTIFIER and ahead.lexeme == 'output_row_size':
                break
            self.consume(Token.VBAR)
            annotations.append(self.parse_annotation())

        return tf_node.ArgNode(typ, annotations)

    def parse_type(self):
        """fmt: off

        type: composed
            | primitive

        fmt: on
        """
        curr = self._curr  # save state
        primitive = self.parse_primitive()
        if self.is_at_end():
            return primitive

        if not self.match(Token.LESS):
            return primitive

        self._curr = curr  # restore state

        return self.parse_composed()

    def parse_composed(self):
        """fmt: off

        composed: "Cursor" "<" arg ("," arg)* ">"
                | IDENTIFIER "<" type ("," type)* ">"

        fmt: on
        """
        idtn = self.parse_identifier()
        self.consume(Token.LESS)
        if is_identifier_cursor(idtn):
            inner = [self.parse_arg()]
            while self.match(Token.COMMA):
                self.consume(Token.COMMA)
                inner.append(self.parse_arg())
        else:
            inner = [self.parse_type()]
            while self.match(Token.COMMA):
                self.consume(Token.COMMA)
                inner.append(self.parse_type())
        self.consume(Token.GREATER)
        return tf_node.ComposedNode(idtn, inner)

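    # Illustrative contrast between the two composed forms: members of a
    # Cursor are full args (they may carry names and annotations), while
    # members of any other composed type are bare types, possibly nested:
    #
    #   Cursor<int32 x, double y>  -> ComposedNode('Cursor', [ArgNode, ArgNode])
    #   Column<Array<double>>      -> ComposedNode('Column', [ComposedNode])
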
    def parse_primitive(self):
        """fmt: off

        primitive: IDENTIFIER
                 | NUMBER
                 | STRING
                 | BOOLEAN

        fmt: on
        """
        if self.match(Token.IDENTIFIER):
            lexeme = self.parse_identifier()
        elif self.match(Token.NUMBER):
            lexeme = self.parse_number()
        elif self.match(Token.STRING):
            lexeme = self.parse_string()
        elif self.match(Token.BOOLEAN):
            lexeme = self.parse_boolean()
        else:
            self.raise_parser_error()
        return tf_node.PrimitiveNode(lexeme)

    def parse_templates(self):
        """fmt: off

        templates: template ("," template)*

        fmt: on
        """
        T = []
        T.append(self.parse_template())
        while not self.is_at_end() and self.match(Token.COMMA):
            self.consume(Token.COMMA)
            T.append(self.parse_template())
        return T

    def parse_template(self):
        """fmt: off

        template: IDENTIFIER "=" "[" IDENTIFIER ("," IDENTIFIER)* "]"

        fmt: on
        """
        key = self.parse_identifier()
        types = []
        self.consume(Token.EQUAL)
        self.consume(Token.LSQB)
        types.append(self.parse_identifier())
        while self.match(Token.COMMA):
            self.consume(Token.COMMA)
            types.append(self.parse_identifier())
        self.consume(Token.RSQB)
        return tf_node.TemplateNode(key, tuple(types))

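    # Illustrative template usage: a trailing "T=[int32, int64]" declares
    # the type variable T, so a signature such as
    #
    #   f(Column<T> x) -> Column<T>, T=[int32, int64]
    #
    # records tf_node.TemplateNode('T', ('int32', 'int64')); downstream code
    # is expected to instantiate one concrete signature per listed type.
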
    def parse_annotation(self):
        """fmt: off

        annotation: IDENTIFIER "=" IDENTIFIER ("<" NUMBER ("," NUMBER)? ">")?
                  | IDENTIFIER "=" "[" PRIMITIVE? ("," PRIMITIVE)* "]"
                  | "require" "=" STRING
                  | "default" "=" STRING | NUMBER | BOOLEAN

        fmt: on
        """
        key = self.parse_identifier()
        self.consume(Token.EQUAL)

        if key == "require":
            value = self.parse_string()
        elif key == "default":
            if self.match(Token.NUMBER):
                value = self.parse_number()
            elif self.match(Token.STRING):
                value = self.parse_string()
            elif self.match(Token.BOOLEAN):
                value = self.parse_boolean()
            else:
                self.raise_parser_error(
                    'Unable to parse value in "default" annotation.\n'
                    'Expected type NUMBER, STRING or BOOLEAN.\n'
                    'Found token: "%s" of type "%s"\n'
                    % (self.current_token().lexeme, Token.tok_name(self.current_token().type))
                )
        elif not self.is_at_end() and self.match(Token.LSQB):
            value = []
            self.consume(Token.LSQB)
            if not self.match(Token.RSQB):
                value.append(self.parse_primitive())
                while self.match(Token.COMMA):
                    self.consume(Token.COMMA)
                    value.append(self.parse_primitive())
            self.consume(Token.RSQB)
        else:
            value = self.parse_identifier()
            if not self.is_at_end() and self.match(Token.LESS):
                self.consume(Token.LESS)
                if self.match(Token.GREATER):
                    value += "<%s>" % (-1)  # signifies no input
                else:
                    num1 = self.parse_number()
                    if self.match(Token.COMMA):
                        self.consume(Token.COMMA)
                        num2 = self.parse_number()
                        value += "<%s,%s>" % (num1, num2)
                    else:
                        value += "<%s>" % (num1)
                self.consume(Token.GREATER)
        return tf_node.AnnotationNode(key, value)

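    # Illustrative annotations accepted by this rule (the keys other than
    # require/default are hypothetical examples):
    #
    #   require="x > 0"   -> AnnotationNode('require', '"x > 0"')
    #   default=10        -> AnnotationNode('default', '10')
    #   fields=[F1, F2]   -> AnnotationNode('fields', [PrimitiveNode, PrimitiveNode])
    #   input_id=args<0>  -> AnnotationNode('input_id', 'args<0>')
    #   input_id=args<>   -> AnnotationNode('input_id', 'args<-1>')  # no input
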
    def parse_identifier(self):
        """fmt: off

        IDENTIFIER: [A-Za-z_][A-Za-z0-9_]*

        fmt: on
        """
        token = self.consume(Token.IDENTIFIER)
        return token.lexeme

    def parse_string(self):
        """fmt: off

        STRING: \".*?\"

        fmt: on
        """
        token = self.consume(Token.STRING)
        return token.lexeme

    def parse_number(self):
        """fmt: off

        NUMBER: [-]([0-9]*[.])?[0-9]+

        fmt: on
        """
        token = self.consume(Token.NUMBER)
        return token.lexeme

    def parse_boolean(self):
        """fmt: off

        BOOLEAN: \bTrue\b|\bFalse\b

        fmt: on
        """
        token = self.consume(Token.BOOLEAN)
        # Normalize booleans to "False"/"True" regardless of the original
        # capitalization, so they can be properly parsed during typechecking.
        return token.lexeme.lower().capitalize()

    def parse(self):
        """fmt: off

        udtf: IDENTIFIER "(" (args)? ")" ("|" annotation)* "->" args ("," templates)? ("|" "output_row_size" "=" primitive)?

        args: arg ("," arg)*

        arg: type IDENTIFIER? ("|" annotation)*

        type: composed
            | primitive

        composed: "Cursor" "<" arg ("," arg)* ">"
                | IDENTIFIER "<" type ("," type)* ">"

        primitive: IDENTIFIER
                 | NUMBER
                 | STRING
                 | BOOLEAN

        annotation: IDENTIFIER "=" IDENTIFIER ("<" NUMBER ("," NUMBER)? ">")?
                  | IDENTIFIER "=" "[" PRIMITIVE? ("," PRIMITIVE)* "]"
                  | "require" "=" STRING
                  | "default" "=" STRING | NUMBER | BOOLEAN

        templates: template ("," template)*
        template: IDENTIFIER "=" "[" IDENTIFIER ("," IDENTIFIER)* "]"

        IDENTIFIER: [A-Za-z_][A-Za-z0-9_]*
        NUMBER: [-]([0-9]*[.])?[0-9]+
        STRING: \".*?\"
        BOOLEAN: \bTrue\b|\bFalse\b

        fmt: on
        """
        self._curr = 0
        udtf = self.parse_udtf()

        # set parent: walk the tree depth-first (the deque is used as a
        # stack), linking every child node back to its containing node
        udtf.parent = None
        d = deque()
        d.append(udtf)
        while d:
            node = d.pop()
            if isinstance(node, Iterable):
                for child in node:
                    child.parent = node
                    d.append(child)
        return udtf
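

# Minimal usage sketch (illustrative, not part of the original module). The
# signature below is hypothetical, and running it requires that
# TableFunctionsFactory_node be importable so Parser() can build nodes.
if __name__ == '__main__':
    signature = 'my_fn(Cursor<int32 x>, int32 m) -> Column<int32> out | output_row_size=m'
    udtf = Parser(signature).parse()  # tf_node.UdtfNode with parent links set
    print(udtf)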