# -----------------------------------------------------------------------------
# ply: lex.py
#
# Author: David M. Beazley (dave@dabeaz.com)
#
# Copyright (C) 2001-2008, David M. Beazley
#
# (License text elided in this copy; see the PLY distribution at
# http://www.dabeaz.com/ply for the full terms.)
# -----------------------------------------------------------------------------
__version__    = "2.5"
__tabversion__ = "2.4"       # Version of table file used

import re, sys, types, copy, os

# Regular expression used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

# _INSTANCETYPE sets the valid set of instance types recognized by PLY when
# lexers are defined by a class.  The check below keeps compatibility with
# older Pythons that lack types.ObjectType.

try:
    _INSTANCETYPE = (types.InstanceType, types.ObjectType)
except AttributeError:
    _INSTANCETYPE = types.InstanceType
    class object: pass          # Note: needed if no new-style classes are present

# Exception thrown when an invalid token is encountered and no default
# error handler is defined.

class LexError(Exception):
    def __init__(self, message, s):
        self.args = (message,)
        self.text = s

# An object used to issue one-time warning messages for various features.

class LexWarning(object):
    def __init__(self):
        self.warned = 0
    def __call__(self, msg):
        if not self.warned:
            sys.stderr.write("ply.lex: Warning: " + msg + "\n")
            self.warned = 1

_SkipWarning = LexWarning()     # Warning for use of the deprecated t.skip()

# Token class.  This class is used to represent the tokens produced.
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type, self.value, self.lineno, self.lexpos)
    def __repr__(self):
        return str(self)
    def skip(self, n):
        self.lexer.skip(n)
        _SkipWarning("Calling t.skip() on a token is deprecated. Please use t.lexer.skip()")

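# A token instance carries four standard attributes.  For example
# (illustrative):
#
#     tok.type      # Token type, a name from the module's 'tokens' list
#     tok.value     # Matched text (or whatever a rule function assigned)
#     tok.lineno    # Line number at which the token was matched
#     tok.lexpos    # Offset of the token from the start of the input
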
# -----------------------------------------------------------------------------
# Lexer class
#
# This class encapsulates all of the methods and data associated with a lexer.
#
#    input()          -  Store a new string in the lexer
#    token()          -  Get the next token
# -----------------------------------------------------------------------------

class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re, findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module in which the lexer was defined
        self.lineno = 1               # Current line number
        self.lexdebug = 0             # Debugging mode
        self.lexoptimize = 0          # Optimized mode

    def clone(self, object=None):
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object.  In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = {}
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object, f[0].__name__), f[1]))
                    newre.append((cre, newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = {}
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object, ef.__name__)
            c.lexmodule = object
        return c
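
    # Example (illustrative): clone() can be used to scan a second input
    # without disturbing the original lexer's position or state:
    #
    #     lexer2 = lexer.clone()
    #     lexer2.input("some other data")
    #     tok = lexer2.token()
    #
    # Passing an instance (clone(obj)) instead rebinds all rule methods to
    # that instance, for lexers that were defined as a class.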

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self, tabfile, outputdir=""):
        if isinstance(tabfile, types.ModuleType):
            return
        basetabfilename = tabfile.split(".")[-1]
        filename = os.path.join(outputdir, basetabfilename) + ".py"
        tf = open(filename, "w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile, __version__))
        tf.write("_lextokens = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = {}
        # Collect all functions in the initial state
        initial = self.lexstatere["INITIAL"]
        initialfuncs = []
        for part in initial:
            for f in part[1]:
                if f and f[0]:
                    initialfuncs.append(f)

        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                titem.append((self.lexstateretext[key][i], _funcs_to_names(lre[i][1], self.lexstaterenames[key][i])))
            tabre[key] = titem

        tf.write("_lexstatere = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = {}
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()
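
    # For reference, the generated table file is a plain Python module of
    # roughly this shape (illustrative, values abbreviated):
    #
    #     _lextokens = {'NUMBER': None, ...}
    #     _lexreflags = 0
    #     _lexliterals = '+-*/'
    #     _lexstateinfo = {'INITIAL': 'inclusive'}
    #     _lexstatere = {'INITIAL': [('(?P<t_NUMBER>\\d+)|...', [...])]}
    #     _lexstateignore = {'INITIAL': ' \t'}
    #     _lexstateerrorf = {'INITIAL': 't_error'}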

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self, tabfile, fdict):
        if isinstance(tabfile, types.ModuleType):
            lextab = tabfile
        else:
            exec "import %s as lextab" % tabfile
        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = {}
        self.lexstateretext = {}
        for key, lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                titem.append((re.compile(lre[i][0], lextab._lexreflags), _names_to_funcs(lre[i][1], fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = {}
        for key, ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')
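
    # Example (illustrative): reloading a table written by writetab(), where
    # the dictionary argument supplies the rule functions by name:
    #
    #     lexer = Lexer()
    #     lexer.readtab("lextab", globals())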

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self, s):
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not (isinstance(c, types.StringType) or isinstance(c, types.UnicodeType)):
            raise ValueError, "Expected a string"
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self, state):
        if not self.lexstatere.has_key(state):
            raise ValueError, "Undefined state"
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state, "")
        self.lexerrorf = self.lexstateerrorf.get(state, None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self, state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

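    # Example (illustrative): rule functions can switch states through the
    # lexer, e.g. to handle the body of a C-style comment in a separate
    # 'comment' state declared in the module's 'states' tuple:
    #
    #     def t_COMMENT(t):
    #         r'/\*'
    #         t.lexer.push_state('comment')
    #
    #     def t_comment_END(t):
    #         r'\*/'
    #         t.lexer.pop_state()
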
    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self, n):
        self.lexpos += n

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing.
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos    = self.lexpos
        lexlen    = self.lexlen
        lexignore = self.lexignore
        lexdata   = self.lexdata

        while lexpos < lexlen:
            # Short-circuit whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre, lexindexfunc in self.lexre:
                m = lexre.match(lexdata, lexpos)
                if not m: continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func, tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If func is not callable, it means it's an ignored token
                if not callable(func):
                    break

                # The token is processed by a function; call it

                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token; if nothing, we just move to the next token
                if not newtok:
                    lexpos = self.lexpos        # This is here in case user has updated lexpos.
                    lexignore = self.lexignore  # This is here in case there was a state change
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if not self.lextokens.has_key(newtok.type):
                        raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func.func_code.co_filename, func.func_code.co_firstlineno,
                            func.__name__, newtok.type), lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok: continue
                    return newtok

                self.lexpos = lexpos
                raise LexError, ("Illegal character '%s' at index %d" % (lexdata[lexpos], lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError, "No input string given with input()"
        return None
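
    # Example (illustrative): token() raises LexError if a t_error rule does
    # not advance the input, so a typical error rule skips the bad character:
    #
    #     def t_error(t):
    #         print "Illegal character '%s'" % t.value[0]
    #         t.lexer.skip(1)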

# -----------------------------------------------------------------------------
# _validate_file()
#
# This checks to see if there are duplicated t_rulename() functions or strings
# in the parsed module.
# -----------------------------------------------------------------------------

def _validate_file(filename):
    import os.path
    base, ext = os.path.splitext(filename)
    if ext != '.py': return 1          # No idea what the file is. Return OK

    try:
        f = open(filename)
        lines = f.readlines()
        f.close()
    except IOError:
        return 1                       # Couldn't find the file. Don't worry about it

    fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
    sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

    counthash = {}
    linen = 1
    noerror = 1
    for l in lines:
        m = fre.match(l)
        if not m:
            m = sre.match(l)
        if m:
            name = m.group(1)
            prev = counthash.get(name)
            if not prev:
                counthash[name] = linen
            else:
                print >>sys.stderr, "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename, linen, name, prev)
                noerror = 0
        linen += 1
    return noerror

# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file.
# -----------------------------------------------------------------------------

def _funcs_to_names(funclist, namelist):
    result = []
    for f, name in zip(funclist, namelist):
        if f and f[0]:
            result.append((name, f[1]))
        else:
            result.append(f)
    return result

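# Example (illustrative): an entry (func, tokname) such as (t_NUMBER, 'NUMBER')
# is stored in the table file as ('t_NUMBER', 'NUMBER'); _names_to_funcs below
# performs the inverse mapping when the table is read back.
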
# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------

def _names_to_funcs(namelist, fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]], n[1]))
        else:
            result.append(n)
    return result

# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression.  Given limitations in the Python re
# module, it may be necessary to break the master regex into separate
# expressions.
# -----------------------------------------------------------------------------

def _form_master_re(relist, reflags, ldict, toknames):
    if not relist: return []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex, re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1)
        lexindexnames = lexindexfunc[:]

        for f, i in lexre.groupindex.items():
            handle = ldict.get(f, None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle, toknames[f])
                lexindexnames[i] = f
            elif handle is not None:
                lexindexnames[i] = f
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None, None)
                else:
                    lexindexfunc[i] = (None, toknames[f])

        return [(lexre, lexindexfunc)], [regex], [lexindexnames]
    except Exception:
        # The master regex is too big for Python's re module (e.g., too many
        # named groups).  Split the rule list in half and try each part.
        m = int(len(relist) / 2)
        if m == 0: m = 1
        llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames)
        rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames)
        return llist + rlist, lre + rre, lnames + rnames

# -----------------------------------------------------------------------------
# _statetoken(s, names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states, tokenname) where states
# is a tuple of state names and tokenname is the name of the token.  For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------

def _statetoken(s, names):
    nonstate = 1
    parts = s.split("_")
    for i in range(1, len(parts)):
        if not names.has_key(parts[i]) and parts[i] != 'ANY': break
    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names.keys())

    tokenname = "_".join(parts[i:])
    return (states, tokenname)

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module.
# -----------------------------------------------------------------------------

def lex(module=None, object=None, debug=0, optimize=0, lextab="lextab", reflags=0, nowarn=0, outputdir=""):
    global lexer
    ldict = None
    stateinfo = {'INITIAL': 'inclusive'}
    error = 0
    files = {}
    lexobj = Lexer()
    lexobj.lexdebug = debug
    lexobj.lexoptimize = optimize
    global token, input

    if nowarn: warn = 0
    else: warn = 1

    if object: module = object

    if module:
        # User supplied a module object.
        if isinstance(module, types.ModuleType):
            ldict = module.__dict__
        elif isinstance(module, _INSTANCETYPE):
            _items = [(k, getattr(module, k)) for k in dir(module)]
            ldict = {}
            for (i, v) in _items:
                ldict[i] = v
        else:
            raise ValueError, "Expected a module or instance"
        lexobj.lexmodule = module

    else:
        # No module given.  Collect the caller's global and local variables by
        # raising and catching an exception to get at the calling stack frame.
        try:
            raise RuntimeError
        except RuntimeError:
            e, b, t = sys.exc_info()
            f = t.tb_frame
            f = f.f_back                    # Walk out to our calling function
            if f.f_globals is f.f_locals:   # Collect global and local variations from caller
                ldict = f.f_globals
            else:
                ldict = f.f_globals.copy()
                ldict.update(f.f_locals)

    if optimize and lextab:
        try:
            lexobj.readtab(lextab, ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Get the tokens, states, and literals variables (if any)
    tokens = ldict.get("tokens", None)
    states = ldict.get("states", None)
    literals = ldict.get("literals", "")

    if not tokens:
        raise SyntaxError, "lex: module does not define 'tokens'"

    if not (isinstance(tokens, types.ListType) or isinstance(tokens, types.TupleType)):
        raise SyntaxError, "lex: tokens must be a list or tuple."

    # Build a dictionary of valid token names
    lexobj.lextokens = {}
    if not optimize:
        for n in tokens:
            if not _is_identifier.match(n):
                print >>sys.stderr, "lex: Bad token name '%s'" % n
                error = 1
            if warn and lexobj.lextokens.has_key(n):
                print >>sys.stderr, "lex: Warning. Token '%s' multiply defined." % n
            lexobj.lextokens[n] = None
    else:
        for n in tokens: lexobj.lextokens[n] = None

    if debug:
        print "lex: tokens = '%s'" % lexobj.lextokens.keys()

    try:
        for c in literals:
            if not (isinstance(c, types.StringType) or isinstance(c, types.UnicodeType)) or len(c) > 1:
                print >>sys.stderr, "lex: Invalid literal %s. Must be a single character" % repr(c)
                error = 1
                continue

    except TypeError:
        print >>sys.stderr, "lex: Invalid literals specification. literals must be a sequence of characters."
        error = 1

    lexobj.lexliterals = literals

    # Build statemap
    if states:
        if not (isinstance(states, types.TupleType) or isinstance(states, types.ListType)):
            print >>sys.stderr, "lex: states must be defined as a tuple or list."
            error = 1
        else:
            for s in states:
                if not isinstance(s, types.TupleType) or len(s) != 2:
                    print >>sys.stderr, "lex: invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')" % repr(s)
                    error = 1
                    continue
                name, statetype = s
                if not isinstance(name, types.StringType):
                    print >>sys.stderr, "lex: state name %s must be a string" % repr(name)
                    error = 1
                    continue
                if not (statetype == 'inclusive' or statetype == 'exclusive'):
                    print >>sys.stderr, "lex: state type for state %s must be 'inclusive' or 'exclusive'" % name
                    error = 1
                    continue
                if stateinfo.has_key(name):
                    print >>sys.stderr, "lex: state '%s' already defined." % name
                    error = 1
                    continue
                stateinfo[name] = statetype

    # Get a list of symbols with the t_ prefix
    tsymbols = [f for f in ldict.keys() if f[:2] == 't_']

    # Now build up a list of functions and a list of strings

    funcsym = {}          # Symbols defined as functions
    strsym = {}           # Symbols defined as strings
    toknames = {}         # Mapping of symbols to token names

    for s in stateinfo.keys():
        funcsym[s] = []
        strsym[s] = []

    ignore = {}           # Ignore strings by state
    errorf = {}           # Error functions by state

    if len(tsymbols) == 0:
        raise SyntaxError, "lex: no rules of the form t_rulename are defined."

    for f in tsymbols:
        t = ldict[f]
        states, tokname = _statetoken(f, stateinfo)
        toknames[f] = tokname

        if callable(t):
            for s in states: funcsym[s].append((f, t))
        elif isinstance(t, types.StringType) or isinstance(t, types.UnicodeType):
            for s in states: strsym[s].append((f, t))
        else:
            print >>sys.stderr, "lex: %s not defined as a function or string" % f
            error = 1

    # Sort the functions by line number
    for f in funcsym.values():
        f.sort(lambda x, y: cmp(x[1].func_code.co_firstlineno, y[1].func_code.co_firstlineno))

    # Sort the strings by regular expression length (longest first, so that
    # longer patterns take precedence in the master regex)
    for s in strsym.values():
        s.sort(lambda x, y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))

    regexs = {}

    # Build the master regular expressions
    for state in stateinfo.keys():
        regex_list = []

        # Add rules defined by functions first
        for fname, f in funcsym[state]:
            line = f.func_code.co_firstlineno
            file = f.func_code.co_filename
            files[file] = None
            tokname = toknames[fname]

            ismethod = isinstance(f, types.MethodType)

            if not optimize:
                nargs = f.func_code.co_argcount
                if ismethod:
                    reqargs = 2
                else:
                    reqargs = 1
                if nargs > reqargs:
                    print >>sys.stderr, "%s:%d: Rule '%s' has too many arguments." % (file, line, f.__name__)
                    error = 1
                    continue

                if nargs < reqargs:
                    print >>sys.stderr, "%s:%d: Rule '%s' requires an argument." % (file, line, f.__name__)
                    error = 1
                    continue

                if tokname == 'ignore':
                    print >>sys.stderr, "%s:%d: Rule '%s' must be defined as a string." % (file, line, f.__name__)
                    error = 1
                    continue

            if tokname == 'error':
                errorf[state] = f
                continue

            if f.__doc__:
                if not optimize:
                    try:
                        c = re.compile("(?P<%s>%s)" % (fname, f.__doc__), re.VERBOSE | reflags)
                        if c.match(""):
                            print >>sys.stderr, "%s:%d: Regular expression for rule '%s' matches empty string." % (file, line, f.__name__)
                            error = 1
                            continue
                    except re.error, e:
                        print >>sys.stderr, "%s:%d: Invalid regular expression for rule '%s'. %s" % (file, line, f.__name__, e)
                        if '#' in f.__doc__:
                            print >>sys.stderr, "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (file, line, f.__name__)
                        error = 1
                        continue

                    if debug:
                        print "lex: Adding rule %s -> '%s' (state '%s')" % (f.__name__, f.__doc__, state)

                # Okay. The regular expression seemed okay.  Append it to the
                # master regular expression we're building.

                regex_list.append("(?P<%s>%s)" % (fname, f.__doc__))
            else:
                print >>sys.stderr, "%s:%d: No regular expression defined for rule '%s'" % (file, line, f.__name__)

        # Now add all of the simple rules defined as strings
        for name, r in strsym[state]:
            tokname = toknames[name]

            if tokname == 'ignore':
                if "\\" in r:
                    print >>sys.stderr, "lex: Warning. %s contains a literal backslash '\\'" % name
                ignore[state] = r
                continue

            if not optimize:
                if tokname == 'error':
                    raise SyntaxError, "lex: Rule '%s' must be defined as a function" % name
                    error = 1
                    continue

                if not lexobj.lextokens.has_key(tokname) and tokname.find("ignore_") < 0:
                    print >>sys.stderr, "lex: Rule '%s' defined for an unspecified token %s." % (name, tokname)
                    error = 1
                    continue
                try:
                    c = re.compile("(?P<%s>%s)" % (name, r), re.VERBOSE | reflags)
                    if c.match(""):
                        print >>sys.stderr, "lex: Regular expression for rule '%s' matches empty string." % name
                        error = 1
                        continue
                except re.error, e:
                    print >>sys.stderr, "lex: Invalid regular expression for rule '%s'. %s" % (name, e)
                    if '#' in r:
                        print >>sys.stderr, "lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name

                    error = 1
                    continue
                if debug:
                    print "lex: Adding rule %s -> '%s' (state '%s')" % (name, r, state)

            regex_list.append("(?P<%s>%s)" % (name, r))

        if not regex_list:
            print >>sys.stderr, "lex: No rules defined for state '%s'" % state
            error = 1

        regexs[state] = regex_list

    if not optimize:
        for f in files.keys():
            if not _validate_file(f):
                error = 1

    if error:
        raise SyntaxError, "lex: Unable to build lexer."

    # From this point forward, we're reasonably confident that we can build
    # the lexer.  No more errors will be generated, but there might be some
    # warning messages.  Build the master regular expressions.

    for state in regexs.keys():
        lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i in range(len(re_text)):
                print "lex: state '%s'. regex[%d] = '%s'" % (state, i, re_text[i])

    # For inclusive states, we need to add the INITIAL state regexs
    for state, type in stateinfo.items():
        if state != "INITIAL" and type == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]

    # Set up ignore variables
    lexobj.lexstateignore = ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL", "")

    # Set up error functions
    lexobj.lexstateerrorf = errorf
    lexobj.lexerrorf = errorf.get("INITIAL", None)
    if warn and not lexobj.lexerrorf:
        print >>sys.stderr, "lex: Warning. no t_error rule is defined."

    # Check state information for ignore and error rules
    for s, stype in stateinfo.items():
        if stype == 'exclusive':
            if warn and not errorf.has_key(s):
                print >>sys.stderr, "lex: Warning. no error rule is defined for exclusive state '%s'" % s
            if warn and not ignore.has_key(s) and lexobj.lexignore:
                print >>sys.stderr, "lex: Warning. no ignore rule is defined for exclusive state '%s'" % s
        elif stype == 'inclusive':
            if not errorf.has_key(s):
                errorf[s] = errorf.get("INITIAL", None)
            if not ignore.has_key(s):
                ignore[s] = ignore.get("INITIAL", "")

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab, outputdir)

    return lexobj
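
# Example (illustrative): a minimal lexer module built with lex():
#
#     import ply.lex as lex
#
#     tokens = ('NUMBER',)
#     t_ignore = ' \t'
#
#     def t_NUMBER(t):
#         r'\d+'
#         t.value = int(t.value)
#         return t
#
#     def t_error(t):
#         t.lexer.skip(1)
#
#     lexer = lex.lex()
#     lexer.input("1 2 3")
#     while 1:
#         tok = lexer.token()
#         if not tok: break
#         print tok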

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program.
# -----------------------------------------------------------------------------

def runmain(lexer=None, data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            print "Reading from standard input (type EOF to end):"
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        print "(%s,%r,%d,%d)" % (tok.type, tok.value, tok.lineno, tok.lexpos)

# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the docstring of a function to a
# regular expression defined elsewhere.
# -----------------------------------------------------------------------------

def TOKEN(r):
    def set_doc(f):
        if callable(r):
            f.__doc__ = r.__doc__
        else:
            f.__doc__ = r
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN
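
# Example (illustrative): @TOKEN attaches a regular expression defined
# elsewhere as a rule's docstring:
#
#     identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         return t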