# -----------------------------------------------------------------------------
# ply: lex.py
#
# Author: David Beazley (dave@dabeaz.com)
#
# Copyright (C) 2001-2006, David M. Beazley
#
# This library is distributed under the terms of the GNU LGPL.  See the
# file COPYING for a complete copy of the license.
# -----------------------------------------------------------------------------
__version__ = "2.2"

import re, sys, types

# Regular expression used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

# Available instance types.  This is used when lexers are defined by a class.
# It's a little funky because I want to preserve backwards compatibility
# with Python 2.0 where types.ObjectType is undefined.

try:
    _INSTANCETYPE = (types.InstanceType, types.ObjectType)
except AttributeError:
    _INSTANCETYPE = types.InstanceType
    class object: pass          # Note: needed if no new-style classes are present

# Exception thrown when an invalid token is encountered and no default
# error handler is defined.
class LexError(Exception):
    def __init__(self,message,s):
        self.args = (message,)
        self.text = s

# Token class
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos)
    def __repr__(self):
        return str(self)
    def skip(self,n):
        self.lexer.skip(n)

# -----------------------------------------------------------------------------
# Lexer class
#
#    input()          -  Store a new string in the lexer
#    token()          -  Get the next token
# -----------------------------------------------------------------------------

class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re,findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # Tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexdebug = 0             # Debugging mode
        self.lexoptimize = 0          # Optimized mode

    def clone(self,object=None):
        c = Lexer()
        c.lexstatere = self.lexstatere
        c.lexstateinfo = self.lexstateinfo
        c.lexstateretext = self.lexstateretext
        c.lexstate = self.lexstate
        c.lexstatestack = self.lexstatestack
        c.lexstateignore = self.lexstateignore
        c.lexstateerrorf = self.lexstateerrorf
        c.lexreflags = self.lexreflags
        c.lexdata = self.lexdata
        c.lexpos = self.lexpos
        c.lexlen = self.lexlen
        c.lextokens = self.lextokens
        c.lexdebug = self.lexdebug
        c.lineno = self.lineno
        c.lexoptimize = self.lexoptimize
        c.lexliterals = self.lexliterals
        c.lexmodule = self.lexmodule

        # If the object parameter has been supplied, it means we are attaching
        # the lexer to a new object.  In this case, we have to rebind all
        # methods in the lexstatere and lexstateerrorf tables.

        if object:
            newtab = { }
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object,f[0].__name__),f[1]))
                    newre.append((cre,newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = { }
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object,ef.__name__)
            c.lexmodule = object

        # Set up other attributes
        c.begin(c.lexstate)
        return c
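
    # Usage sketch (illustrative, not part of the original file): clone()
    # duplicates a lexer while sharing its compiled master regular
    # expressions, so each copy keeps independent position/state:
    #
    #     lexer2 = lexer.clone()
    #     lexer2.input(other_data)     # scanning this does not disturb 'lexer'
    #
    # Passing an instance rebinds all method-based rules onto that instance.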

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self,tabfile):
        tf = open(tabfile+".py","w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
        tf.write("_lextokens = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = { }
        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1])))
            tabre[key] = titem

        tf.write("_lexstatere = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = { }
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()
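
    # Usage sketch (illustrative): writetab() is normally triggered by
    # building the lexer in optimized mode, which caches the master regular
    # expressions in a lextab module for faster startup on later runs:
    #
    #     lexer = lex(optimize=1)      # first run writes lextab.py
    #     lexer = lex(optimize=1)      # later runs reload it via readtab()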

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self,tabfile,fdict):
        exec "import %s as lextab" % tabfile
        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = { }
        self.lexstateretext = { }
        for key,lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                titem.append((re.compile(lre[i][0],lextab._lexreflags),_names_to_funcs(lre[i][1],fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = { }
        for key,ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self,s):
        if not (isinstance(s,types.StringType) or isinstance(s,types.UnicodeType)):
            raise ValueError, "Expected a string"
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self,state):
        if not self.lexstatere.has_key(state):
            raise ValueError, "Undefined state"
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state,"")
        self.lexerrorf = self.lexstateerrorf.get(state,None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self,state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate
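
    # Usage sketch (illustrative): rules switch states through the lexer,
    # e.g. hypothetical rules for an exclusive 'ccomment' state declared as
    # states = (('ccomment','exclusive'),) in the defining module:
    #
    #     def t_ccomment(t):
    #         r'/\*'
    #         t.lexer.push_state('ccomment')   # enter the comment state
    #
    #     def t_ccomment_end(t):
    #         r'\*/'
    #         t.lexer.pop_state()              # return to the previous state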

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self,n):
        self.lexpos += n

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos    = self.lexpos
        lexlen    = self.lexlen
        lexignore = self.lexignore
        lexdata   = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre,lexindexfunc in self.lexre:
                m = lexre.match(lexdata,lexpos)
                if not m: continue

                # Set last match in lexer so that rules can access it if they want
                self.lexmatch = m

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos
                tok.lexer = self

                lexpos = m.end()
                i = m.lastindex
                func,tok.type = lexindexfunc[i]
                self.lexpos = lexpos

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type: return tok
                    break

                # If func is not callable, it means it's an ignored token
                if not callable(func):
                    break

                # If token is processed by a function, call it
                newtok = func(tok)

                # Every function must return a token; if nothing, we just move to the next token
                if not newtok:
                    lexpos = self.lexpos        # This is here in case the user has updated lexpos.
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if not self.lextokens.has_key(newtok.type):
                        raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func.func_code.co_filename, func.func_code.co_firstlineno,
                            func.__name__, newtok.type),lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.lexer = self
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok: continue
                    return newtok

                self.lexpos = lexpos
                raise LexError, ("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError, "No input string given with input()"
        return None
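
    # Usage sketch (illustrative): token() is called repeatedly after
    # input() and returns None when the data is exhausted:
    #
    #     lexer.input("3 + 4")
    #     while 1:
    #         tok = lexer.token()
    #         if not tok: break
    #         print tok.type, tok.value, tok.lineno, tok.lexpos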

# -----------------------------------------------------------------------------
# _validate_file()
#
# This checks to see if there are duplicated t_rulename() functions or strings
# in the parser input file.  This is done using a simple regular expression
# match on each line of the given file.
# -----------------------------------------------------------------------------

def _validate_file(filename):
    import os.path
    base,ext = os.path.splitext(filename)
    if ext != '.py': return 1          # No idea what the file is. Return OK

    try:
        f = open(filename)
        lines = f.readlines()
        f.close()
    except IOError:
        return 1                       # Couldn't find the file.  Don't worry about it

    fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
    sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

    counthash = { }
    linen = 1
    noerror = 1
    for l in lines:
        m = fre.match(l)
        if not m:
            m = sre.match(l)
        if m:
            name = m.group(1)
            prev = counthash.get(name)
            if not prev:
                counthash[name] = linen
            else:
                print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev)
                noerror = 0
        linen += 1
    return noerror
384
385
386
387
388
389
390
391
393 result = []
394 for f in funclist:
395 if f and f[0]:
396 result.append((f[0].__name__,f[1]))
397 else:
398 result.append(f)
399 return result
400
401
402
403
404
405
406
407
409 result = []
410 for n in namelist:
411 if n and n[0]:
412 result.append((fdict[n[0]],n[1]))
413 else:
414 result.append(n)
415 return result
416
417
418
419
420
421
422
423
424
453
454
455
456
457
458
459
460
461
462
    nonstate = 1
    parts = s.split("_")
    for i in range(1,len(parts)):
        if not names.has_key(parts[i]) and parts[i] != 'ANY': break
    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names.keys())

    tokenname = "_".join(parts[i:])
    return (states,tokenname)

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------

def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0):
    global lexer
    ldict = None
    stateinfo = { 'INITIAL' : 'inclusive'}
    error = 0
    files = { }
    lexobj = Lexer()
    lexobj.lexdebug = debug
    lexobj.lexoptimize = optimize
    global token,input

    if nowarn: warn = 0
    else: warn = 1

    if object: module = object

    if module:
        # User supplied a module object
        if isinstance(module, types.ModuleType):
            ldict = module.__dict__
        elif isinstance(module, _INSTANCETYPE):
            _items = [(k,getattr(module,k)) for k in dir(module)]
            ldict = { }
            for (i,v) in _items:
                ldict[i] = v
        else:
            raise ValueError,"Expected a module or instance"
        lexobj.lexmodule = module

    else:
        # No module given.  Get the caller's dictionary by raising an
        # exception and walking out of the resulting traceback.
        try:
            raise RuntimeError
        except RuntimeError:
            e,b,t = sys.exc_info()
            f = t.tb_frame
            f = f.f_back           # Walk out to our calling function
            ldict = f.f_globals    # Grab its globals dictionary

    if optimize and lextab:
        try:
            lexobj.readtab(lextab,ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Get the tokens, states, and literals variables (if any)
    if (module and isinstance(module,_INSTANCETYPE)):
        tokens = getattr(module,"tokens",None)
        states = getattr(module,"states",None)
        literals = getattr(module,"literals","")
    else:
        tokens = ldict.get("tokens",None)
        states = ldict.get("states",None)
        literals = ldict.get("literals","")

    if not tokens:
        raise SyntaxError,"lex: module does not define 'tokens'"
    if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
        raise SyntaxError,"lex: tokens must be a list or tuple."

    # Build a dictionary of valid token names
    lexobj.lextokens = { }
    if not optimize:
        for n in tokens:
            if not _is_identifier.match(n):
                print "lex: Bad token name '%s'" % n
                error = 1
            if warn and lexobj.lextokens.has_key(n):
                print "lex: Warning. Token '%s' multiply defined." % n
            lexobj.lextokens[n] = None
    else:
        for n in tokens: lexobj.lextokens[n] = None

    if debug:
        print "lex: tokens = '%s'" % lexobj.lextokens.keys()

    try:
        for c in literals:
            if not (isinstance(c,types.StringType) or isinstance(c,types.UnicodeType)) or len(c) > 1:
                print "lex: Invalid literal %s. Must be a single character" % repr(c)
                error = 1
                continue

    except TypeError:
        print "lex: Invalid literals specification. literals must be a sequence of characters."
        error = 1

    lexobj.lexliterals = literals

    # Build statemap
    if states:
        if not (isinstance(states,types.TupleType) or isinstance(states,types.ListType)):
            print "lex: states must be defined as a tuple or list."
            error = 1
        else:
            for s in states:
                if not isinstance(s,types.TupleType) or len(s) != 2:
                    print "lex: invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')" % repr(s)
                    error = 1
                    continue
                name, statetype = s
                if not isinstance(name,types.StringType):
                    print "lex: state name %s must be a string" % repr(name)
                    error = 1
                    continue
                if not (statetype == 'inclusive' or statetype == 'exclusive'):
                    print "lex: state type for state %s must be 'inclusive' or 'exclusive'" % name
                    error = 1
                    continue
                if stateinfo.has_key(name):
                    print "lex: state '%s' already defined." % name
                    error = 1
                    continue
                stateinfo[name] = statetype

    # Get a list of symbols with the t_ prefix
    tsymbols = [f for f in ldict.keys() if f[:2] == 't_' ]

    # Now build up a list of functions and a list of strings

    funcsym = { }        # Symbols defined as functions
    strsym = { }         # Symbols defined as strings
    toknames = { }       # Mapping of symbols to token names

    for s in stateinfo.keys():
        funcsym[s] = []
        strsym[s] = []

    ignore = { }         # Ignore strings by state
    errorf = { }         # Error functions by state

    if len(tsymbols) == 0:
        raise SyntaxError,"lex: no rules of the form t_rulename are defined."

    for f in tsymbols:
        t = ldict[f]
        states, tokname = _statetoken(f,stateinfo)
        toknames[f] = tokname

        if callable(t):
            for s in states: funcsym[s].append((f,t))
        elif (isinstance(t, types.StringType) or isinstance(t,types.UnicodeType)):
            for s in states: strsym[s].append((f,t))
        else:
            print "lex: %s not defined as a function or string" % f
            error = 1

    # Sort the functions by line number
    for f in funcsym.values():
        f.sort(lambda x,y: cmp(x[1].func_code.co_firstlineno,y[1].func_code.co_firstlineno))

    # Sort the strings by regular expression length
    for s in strsym.values():
        s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))

    regexs = { }

    # Build the master regular expressions
    for state in stateinfo.keys():
        regex_list = []

        # Add rules defined by functions first
        for fname, f in funcsym[state]:
            line = f.func_code.co_firstlineno
            file = f.func_code.co_filename
            files[file] = None
            tokname = toknames[fname]

            ismethod = isinstance(f, types.MethodType)

            if not optimize:
                nargs = f.func_code.co_argcount
                if ismethod:
                    reqargs = 2
                else:
                    reqargs = 1
                if nargs > reqargs:
                    print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__)
                    error = 1
                    continue

                if nargs < reqargs:
                    print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__)
                    error = 1
                    continue

                if tokname == 'ignore':
                    print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__)
                    error = 1
                    continue

            if tokname == 'error':
                errorf[state] = f
                continue

            if f.__doc__:
                if not optimize:
                    # Try to compile it first to see if there are any errors
                    try:
                        c = re.compile("(?P<%s>%s)" % (f.__name__,f.__doc__), re.VERBOSE | reflags)
                        if c.match(""):
                            print "%s:%d: Regular expression for rule '%s' matches empty string." % (file,line,f.__name__)
                            error = 1
                            continue
                    except re.error,e:
                        print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e)
                        if '#' in f.__doc__:
                            print "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (file,line, f.__name__)
                        error = 1
                        continue

                    if debug:
                        print "lex: Adding rule %s -> '%s' (state '%s')" % (f.__name__,f.__doc__, state)

                # Okay. The definition is good.  Add it to the regular
                # expression list
                regex_list.append("(?P<%s>%s)" % (f.__name__,f.__doc__))
            else:
                print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__)

        # Now add all of the simple rules
        for name,r in strsym[state]:
            tokname = toknames[name]

            if tokname == 'ignore':
                ignore[state] = r
                continue

            if not optimize:
                if tokname == 'error':
                    raise SyntaxError,"lex: Rule '%s' must be defined as a function" % name
                    error = 1
                    continue

                if not lexobj.lextokens.has_key(tokname) and tokname.find("ignore_") < 0:
                    print "lex: Rule '%s' defined for an unspecified token %s." % (name,tokname)
                    error = 1
                    continue
                try:
                    c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags)
                    if (c.match("")):
                        print "lex: Regular expression for rule '%s' matches empty string." % name
                        error = 1
                        continue
                except re.error,e:
                    print "lex: Invalid regular expression for rule '%s'. %s" % (name,e)
                    if '#' in r:
                        print "lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name
                    error = 1
                    continue
                if debug:
                    print "lex: Adding rule %s -> '%s' (state '%s')" % (name,r,state)

            regex_list.append("(?P<%s>%s)" % (name,r))

        if not regex_list:
            print "lex: No rules defined for state '%s'" % state
            error = 1

        regexs[state] = regex_list

    # Check the defining files for duplicated rules
    if not optimize:
        for f in files.keys():
            if not _validate_file(f):
                error = 1

    if error:
        raise SyntaxError,"lex: Unable to build lexer."

    # From this point forward, we're reasonably confident that we can build
    # the lexer.  No more errors will be generated, but there might be some
    # warning messages.

    # Build the master regular expressions
    for state in regexs.keys():
        lexre, re_text = _form_master_re(regexs[state],reflags,ldict)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        if debug:
            for i in range(len(re_text)):
                print "lex: state '%s'. regex[%d] = '%s'" % (state, i, re_text[i])

    # For inclusive states, we need to add the INITIAL state rules
    for state,type in stateinfo.items():
        if state != "INITIAL" and type == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]

    # Set up ignore variables
    lexobj.lexstateignore = ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Set up error functions
    lexobj.lexstateerrorf = errorf
    lexobj.lexerrorf = errorf.get("INITIAL",None)
    if warn and not lexobj.lexerrorf:
        print "lex: Warning. no t_error rule is defined."

    # Check state information for ignore and error rules
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
            if warn and not errorf.has_key(s):
                print "lex: Warning. no error rule is defined for exclusive state '%s'" % s
            if warn and not ignore.has_key(s) and lexobj.lexignore:
                print "lex: Warning. no ignore rule is defined for exclusive state '%s'" % s
        elif stype == 'inclusive':
            if not errorf.has_key(s):
                errorf[s] = errorf.get("INITIAL",None)
            if not ignore.has_key(s):
                ignore[s] = ignore.get("INITIAL","")

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab)

    return lexobj
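
# Usage sketch (illustrative): a minimal module scanned by lex().  The token
# names and rules below are examples, not definitions made by this file:
#
#     tokens = ('NUMBER','PLUS')
#
#     t_PLUS = r'\+'                 # simple tokens may be plain strings
#     t_ignore = ' \t'               # characters skipped between tokens
#
#     def t_NUMBER(t):               # rules needing an action are functions
#         r'\d+'
#         t.value = int(t.value)
#         return t
#
#     def t_error(t):
#         print "Illegal character '%s'" % t.value[0]
#         t.lexer.skip(1)
#
#     lexer = lex()                  # builds the lexer from these definitions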

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            print "Reading from standard input (type EOF to end):"
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        print "(%s,%r,%d,%d)" % (tok.type, tok.value, tok.lineno,tok.lexpos)

# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the docstring of a function
# -----------------------------------------------------------------------------

def TOKEN(r):
    def set_doc(f):
        f.__doc__ = r
        return f
    return set_doc
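
# Usage sketch (illustrative): TOKEN attaches a regular expression, possibly
# assembled from named parts, as a rule's docstring:
#
#     identifier = r'[a-zA-Z_][a-zA-Z_0-9]*'
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         return t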

# Alternative spelling of the TOKEN decorator
Token = TOKEN