
kjParser.py

#
# python for parser interpretation
#  Copyright Aaron Robert Watters, 1994
#

# BUGS:
# Lexical error handling is not nice
# Parse error handling is not nice
#
# Lex analysis may be slow for big grammars
# Setting case sensitivity for keywords MUST happen BEFORE
#   declaration of keywords.

import kjSet
import string
import re

# set this flag for regression testing at each load
RUNTESTS = 0

# set this flag to enable warning for default reductions
WARNONDEFAULTS = 0

# some local constants
NOMATCHFLAG = -2 # FLAG FOR NO MATCH IN FSM
MOVETOFLAG = -3 # FLAG FOR "SHIFT" IN SN FSM
REDUCEFLAG = -4 # FLAG FOR REDUCTION IN FSM
TRANSFLAG = -5 # FLAG FOR TRANSIENT STATE IN FSM
KEYFLAG = -6 # FLAG FOR KEYWORD
NONTERMFLAG = -7 # FLAG FOR NONTERMINAL
TERMFLAG = -8 # FLAG FOR TERMINAL
EOFFLAG = "*" # FLAG for End of file

# set this string to the Module name (filename)
# used for dumping reconstructable objects
THISMODULE = "kjParser"

# regular expression for matching whitespace
WHITERE = "["+string.whitespace+"]+"
WHITEREGEX = re.compile(WHITERE)

# local errors
LexTokenError = "LexTokenError" # may happen on bad string
UnkTermError = "UnkTermError" # ditto
BadPunctError = "BadPunctError" # raised on an attempt to make whitespace a punctuation
ParseInitError = "ParseInitError" # shouldn't happen?
#EOFError # may happen on bad string
FlowError = "FlowError" # shouldn't happen!!! (bug)
#SyntaxError # may happen on bad string
#TypeError
ReductError = "ReductError" # shouldn't happen?
NondetError = "NondetError" # shouldn't happen?

# the end of file is interpreted in the lexical stream as
# a terminal...
#  this should be appended to the lexical stream:
ENDOFFILETOKEN = (TERMFLAG, EOFFLAG)

# in FSM use the following terminal to indicate eof
ENDOFFILETERM = (ENDOFFILETOKEN, EOFFLAG)

# utility function emulating the old regex-module match interface on top
# of re: returns the length of the match at start, or -1 if there is none
def RMATCH(regex, key, start=0):
    #print "RMATCH: %s -> %s <- start=%s" % (regex.pattern, key, start)
    group = regex.match(key, start)
    if group is None:
        #print "RMATCH: -1"
        return -1
    length = group.end() - group.start()
    #print "RMATCH: %s (%s)" % (length, group.group())
    return length
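
# A minimal sketch (not part of the original module) illustrating the
# regex-module-style return convention of RMATCH: the length of the match,
# or -1 when there is no match at the given start position.
def _example_rmatch():
    digits = re.compile("[0-9]+")
    print RMATCH(digits, "123abc")      # 3  (length of the match)
    print RMATCH(digits, "abc123")      # -1 (no match at position 0)
    print RMATCH(digits, "abc123", 3)   # 3  (match starting at offset 3)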

# utility function for error diagnostics
def DumpStringWindow(Str, Pos, Offset=15):
    L = []
    L.append("near ::")
    start = Pos-Offset
    end = Pos+Offset
    if start<0: start = 0
    if end>len(Str): end = len(Str)
    L.append(`Str[start:Pos]`+"*"+`Str[Pos:end]`)
    from string import join
    return join(L, "\n")

# lexical dictionary class
#   this data structure is used by lexical parser below.
#
#   basic operations:
#      LD.punctuation(string)
#         registers a string as a punctuation
#           EG: LD.punctuation(":")
#      Punctuations are treated as a special kind of keyword
#      that is recognized even when not surrounded by whitespace.
#      IE, "xend" will not be recognized as "x end", but "x;" will be
#          recognized as "x ;" if "end" is a regular keyword but
#          ";" is a punctuation.  Only single character punctuations
#          are supported (now), ie, ":=" must be recognized as
#          ":" "=" above the lexical level.
#
#      LD.comment(compiled_reg_expression)
#         registers a comment pattern
#           EG: LD.comment("--.*\n")
#             asks to recognize ansi/sql comments like "-- correct?\n"
#
#      LD.keyword(keyword_string, canonicalstring)
#         specifies a keyword string that should map to the canonicalstring
#         when translated to the lexical stream.
#           EG: LD.keyword("begin","BEGIN"); LD.keyword("BEGIN","BEGIN")
#            will recognize upcase or downcase begins, not mixed case.
#            (automatic upcasing is allowed below at parser level).
#
#      LD[compiled_reg_expression] = (TerminalFlag, Function) # assignment!
#         specifies a regular expression that should be associated
#         with the lexical terminal marker TerminalFlag
#           EG: LD[re.compile("[0-9]+")] = ("integer",string.atoi)
#         the Function should be a function on one string argument
#         that interprets the matching string as a value. if None is
#         given, just the string itself will be used as the
#         interpretation.  (a better choice above would be a function
#         which "tries" atoi first and uses atol on overflow).
#      NOTE: ambiguity among regular expressions will be decided
#         arbitrarily (fix?).
#
#      LD[string] # retrieval!
#         returns ((KEYFLAG, Keywordstring), Keywordstring)
#          if the (entire) string matches a keyword or a 
#          punctuation Keywordstring.
#         otherwise returns ((TERMFLAG, Terminalname), value)
#          if the (entire) string matches the regular expression for
#          a terminal flagged by Terminalname; value is the interpreted
#          value.  TerminalFlag better be something other than
#          KEYFLAG!
#         otherwise raises an error!
#         comments not filtered here!
#
# the following additional functions are used for autodocumentation
# in declaring rules, etcetera.
#     begin = LD.keyword("begin")
#        sets variable "begin" to (KEYFLAG, "BEGIN") if
#        "begin" maps to keyword "BEGIN" in LD
#     integer = LD.terminal("integer")
#        sets variable integer to the token representation
#        (TERMFLAG, "integer") if "integer" is a registered terminal.
#     (a small usage sketch appears after the class definition below)
#
class LexDictionary:

  def __init__(self):
    # commentpatterns is simply a list of compiled regular expressions
    # that represent comments
    self.commentpatterns = []
    # commentstrings is used for debugging/dumping/reconstruction etc.
    self.commentstrings = []
    # punctuationlist is a string of punctuations
    self.punctuationlist = ""
    # keywordmap is a dictionary mapping recognized keyword strings
    # and punctuations to their constant representations.
    self.keywordmap = KeywordDict()
    # regexprlist is a list of triples (regex,Flag,function) mapping
    # regular expressions to their flag and interpreter function.
    self.regexprlist = []

  def Dump(self):
    print "comments = ", self.commentstrings
    print "punctuations = ", self.punctuationlist
    print "keywordmap ="
    self.keywordmap.Dump()
    print "regexprlist =", self.regexprlist

  def __getitem__(self,key):
    # try to match string to a keyword
    try:
       return self.keywordmap[key]
    except KeyError:
       # try to match a regular expression
       found = 0 # so far not found
       length = len(key)
       for triple in self.regexprlist:
          (regexpr, Flag, Function) = triple
          index = RMATCH(regexpr,key)
          if index == length:
             found = 1
             # use the function to interpret the string, if given
             if Function != None:
                value = Function(key)
             else:
                value = key
             # NONLOCAL RETURN
             return (Flag, value)
       #endfor
       raise LexTokenError, "no match for string: " + `key`
  #enddef __getitem__

  # LD.keyword("this") will make a new keyword "this" if not found
  #
  def keyword(self,str):
    # upcase the string, if needed
    if self.keywordmap.caseInsensitive:
       str = string.upper(str)
    if not self.keywordmap.has_key(str):
       # redundancy to avoid excess construction during parsing
       token = (KEYFLAG,str)
       self.keywordmap[str] = (token,str)
    else:
       (token, str2) = self.keywordmap[str]
    return token

  # LD.terminal("this") will just look for "this"
  # LD.terminal("this", RE, F) will register a new terminal
  #   RE must be a compiled regular expression or string reg ex
  #   F must be an interpretation function
  #
  def terminal(self, string, RegExpr=None, Function=None):
    if RegExpr != None and Function != None:
       if type(RegExpr) == type(""):
          RegExpr = re.compile(RegExpr)
       self[ RegExpr ] = ( string, Function)
    for triple in self.regexprlist:
       (regexpr,token,Function) = triple
       if token[1] == string:
          # nonlocal exit
          return token
    #endfor
    # error if no exit by now
    raise UnkTermError, "no such terminal"

  def __setitem__(self,key,value):
    if type(key) == type(''):
       # if it's a string it must be a keyword
       if self.keywordmap.caseInsensitive:
          value = string.upper(value)
          key = string.upper(key)
       self.keywordmap[key] = ( (KEYFLAG, value), value)
    else:
       # otherwise it better be a compiled regular expression
       # (not verified)
       (Name, Function) = value
       Flag = (TERMFLAG, Name)
       regexpr = key
       self.regexprlist = self.regexprlist + \
                          [ (regexpr, Flag, Function) ]

  # register a regular expression as a comment
  def comment(self, string):
    # regexpr better be an uncompiled string regular expression! (not verified)
    regexpr = re.compile(string)
    self.commentpatterns = self.commentpatterns + [ regexpr ]
    self.commentstrings = self.commentstrings + [ string ]

  # register a string as a punctuation
  def punctuation(self,Instring):
    if type(Instring) != type("") or len(Instring)!=1:
       raise BadPunctError, "punctuation must be string of length 1"
    if Instring in string.whitespace:
       raise BadPunctError, "punctuation may not be whitespace"
    self.punctuationlist = self.punctuationlist + Instring
    return self.keyword(Instring)

  # testing and altering case sensitivity behavior
  def isCaseSensitive(self):
    return not self.keywordmap.caseInsensitive

  # setting case sensitivity MUST happen before keyword
  # declarations!
  def SetCaseSensitivity(self, Boolean):
    self.keywordmap.caseInsensitive = not Boolean

  # function to do same as __getitem__ above but looking _inside_ a string
  # instead of at the whole string
  # returns (token,skip)
  # where token is one of
  #  ((KEYFLAG,name),name) or ((TERMFLAG,termname),value)
  # and skip is the length of the substring of String that matches the token
  def Token(self, String, StartPosition):
     finished = 0 # dummy, exit should be nonlocal
     totalOffset = 0
     while not finished:
        # flag EOF if past end of string?
        if len(String) <= StartPosition:
           return (ENDOFFILETERM, 0)
        # skip whitespace
        whitespacefound = 0
        skip = RMATCH(WHITEREGEX,String, StartPosition)
        if skip > 0:
           StartPosition = StartPosition + skip
           totalOffset = totalOffset + skip
           whitespacefound = 1
        # try to find comment, keyword, term in that order:
        # looking for comment
        commentfound = 0
        for commentexpr in self.commentpatterns:
           offset = RMATCH(commentexpr,String,StartPosition)
           if offset != -1:
              if offset<1:
                 info = DumpStringWindow(String,StartPosition)
                 raise LexTokenError, "zero length comment "+info
              commentfound = 1
              StartPosition = StartPosition + offset
              totalOffset = totalOffset + offset
        # looking for a keyword
        keypair = self.keywordmap.hasPrefix(String,StartPosition,
                      self.punctuationlist)
        if keypair != 0:
           return ( keypair[0], keypair[1] + totalOffset)
        # looking for terminal
        for (regexpr, Flag, Function) in self.regexprlist:
           offset = RMATCH(regexpr,String,StartPosition)
           if offset != -1:
              matchstring = String[StartPosition : offset+StartPosition]
              if Function != None:
                 value = Function(matchstring)
              else:
                 value = matchstring
              return ((Flag, value) , offset + totalOffset)
        if not (commentfound or whitespacefound):
           info = DumpStringWindow(String,StartPosition)
           raise LexTokenError, "Lexical parse failure "+info
     #endwhile
  #enddef
#endclass LexDictionary
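
# A minimal usage sketch of the lexical dictionary interface documented
# above (not part of the original module; the keyword, punctuation,
# comment and terminal declarations are illustrative only).
def _example_lexdict_usage():
    LD = LexDictionary()
    LD.SetCaseSensitivity(0)           # must precede keyword declarations
    begin = LD.keyword("begin")        # -> (KEYFLAG, "BEGIN")
    semi = LD.punctuation(";")         # single character punctuation
    LD.comment("--.*\n")               # ansi/sql style comments
    integer = LD.terminal("integer", "[0-9]+", string.atoi)
    # Token scans inside a string and returns (token, skip):
    print LD.Token("begin 42 ; -- done\n", 0)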

# alternate, experimental implementation

class lexdictionary:

   def __init__(self):
       self.skip = ""
       self.commentstrings = []
       self.punctuationlist = ""
       self.keywordmap = KeywordDict()
       self.termlist = [] # list of (term, regex, flag, interpret_fn)
       self.uncompiled = 1 # only compile after full initialization.
       self.laststring= self.lastindex= self.lastresult = None

   def Dump(self, *k):
       raise "sorry", "not implemented"
   __getitem__ = Dump

   def keyword(self, str):
       kwm = self.keywordmap
       if kwm.caseInsensitive:
          str = string.upper(str)
       try:
          (token, str2) = kwm[str]
       except:
          token = (KEYFLAG, str)
          self.keywordmap[str] = (token,str)
       return token

   def terminal(self, str, regexstr=None, Function=None):
       if regexstr is not None:
          flag = (TERMFLAG, str)
          self.termlist.append( (str, regexstr, flag, Function) )
          return flag
       else:
          # termlist holds 4-tuples (term, regex, flag, interpret_fn)
          for (term, rgex, fl, fn) in self.termlist:
              if fl[1] == str:
                 return fl
          # error if no terminal of that name was registered
          raise UnkTermError, "no such terminal"

   __setitem__ = Dump

   def comment(self, str):
       self.commentstrings.append(str)

   def punctuation(self, Instring):
       if type(Instring) != type("") or len(Instring)!=1:
         raise BadPunctError, "punctuation must be string of length 1"
       if Instring in string.whitespace:
         raise BadPunctError, "punctuation may not be whitespace"
       self.punctuationlist = self.punctuationlist + Instring
       return self.keyword(Instring)

   def SetCaseSensitivity(self, Boolean):
       self.keywordmap.caseInsensitive = not Boolean

   def Token(self, String, StartPosition):
       # shortcut for reductions.
       if self.laststring is String and self.lastindex == StartPosition:
          #print "lastresult", self.lastresult
          return self.lastresult
       self.lastindex = StartPosition
       self.laststring = String
       #print `String[StartPosition: StartPosition+60]`

       if self.uncompiled:
          self.compile()
          self.uncompiled = None
       finished = 0
       totalOffset = 0
       skipprog = self.skipprog
       keypairfn = self.keywordmap.hasPrefix
       punctlist = self.punctuationlist
       termregex = self.termregex
       while not finished:
          if len(String) <= StartPosition:
             result = self.lastresult = (ENDOFFILETERM, 0)
             return result
          # skip ws and comments 
          #skip = skipprog.match(String, StartPosition)
          skip = RMATCH(skipprog, String, StartPosition)
          if skip == 0:
             info = DumpStringWindow(String, StartPosition)
             raise LexTokenError, \
               "zero length whitespace or comment "+info
          if skip > 0:
             StartPosition = StartPosition + skip
             totalOffset = totalOffset + skip
             continue
          # look for keyword
          keypair = keypairfn(String, StartPosition, punctlist)
          if keypair!=0:
             #print "keyword", keypair
             result = self.lastresult = (keypair[0], keypair[1]+totalOffset)
             return result
          # look for terminal
          #print "Termregex: %s --> %s <-- start=%s" % (termregex.pattern, String, StartPosition)
          offset = termregex.match(String, StartPosition)
          if offset is not None:
             g = offset.group
             for (term, regex, flag, fn) in self.termlist:
                 test = g(term)
                 if test:
                    #print "terminal", test
                    if fn is not None:
                       value = fn(test)
                    else:
                       value = test
                    result = self.lastresult = (
                       (flag, value), offset.end() - offset.start() + totalOffset)
                    return result
          # error if we get here
          info = DumpStringWindow(String, StartPosition)
          raise LexTokenError, "Lexical token not found "+info

   def isCaseSensitive(self):
       return not self.keywordmap.caseInsensitive

   def compile(self):
       from string import joinfields, whitespace
       import re
       skipregexen = self.commentstrings + [WHITERE]
       skipregex = "(" + joinfields(skipregexen, ")|(") + ")"
       #print skipregex; import sys; sys.exit(1)
       self.skipprog = re.compile(skipregex)
       termregexen = []
       termnames = []
       for (term, rgex, flag, fn) in self.termlist:
           fragment = "(?P<%s>%s)" % (term, rgex)
           termregexen.append(fragment)
           termnames.append(term)
       termregex = joinfields(termregexen, "|")
       self.termregex = re.compile(termregex)
       self.termnames = termnames

LexDictionary = lexdictionary ##### test!

# a utility class: dictionary of prefixes
#  should be generalized to allow upcasing of keyword matches
class KeywordDict:

  def __init__(self, caseInsensitive = 0):
     self.FirstcharDict = {}
     self.KeyDict = {}
     self.caseInsensitive = caseInsensitive

  def Dump(self):
     if self.caseInsensitive:
        print "  case insensitive"
     else:
        print "  case sensitive"
     keys = self.KeyDict.keys()
     print "  keyDict has ", len(keys), " elts"
     for key in keys:
        print "     ", key," maps to ",self.KeyDict[key]
     firstchars = self.FirstcharDict.keys()
     print "  firstcharDict has ", len(firstchars), " elts"
     for char in firstchars:
        print "     ", char," maps to ",self.FirstcharDict[char]

  # set item assumes value has correct case already, if case sensitive
  def __setitem__(self, key, value):
     if len(key)<1:
        raise LexTokenError, "Keyword of length 0"
     if self.caseInsensitive:
        KEY = string.upper(key)
     else:
        KEY = key
     firstchar = KEY[0:1]
     if self.FirstcharDict.has_key(firstchar):
        self.FirstcharDict[firstchar] = \
          self.FirstcharDict[firstchar] + [(KEY, value)]
     else:
        self.FirstcharDict[firstchar] = [(KEY, value)]
     self.KeyDict[KEY] = value

  # if String has a registered keyword at start position
  #  return its canonical representation and offset, else 0
  # keywords that are not punctuations should be 
  # recognized only if followed
  # by a punctuation or whitespace char
  #
  def hasPrefix(self,String,StartPosition,punctuationlist):
     First = String[StartPosition:StartPosition+1]
     fcd = self.FirstcharDict
     caseins = self.caseInsensitive
     if caseins:
        First = string.upper(First)
     if fcd.has_key(First):
        Keylist = fcd[First]
     else:
        return 0
     for (key,value) in Keylist:
           offset = len(key)
           EndPosition = StartPosition+offset
           match = String[StartPosition : EndPosition]
           if caseins:
              match = string.upper(match)
           if key == match:
              if len(key)==1 and key in punctuationlist:
                 # punctuations are recognized regardless of nextchar
                 return (value,offset)
              else:
                 # nonpuncts must have punct or whitespace following
                 #(uses punct as single char convention)
                 if EndPosition == len(String):
                    return (value, offset)
                 else:
                    nextchar = String[EndPosition]
                    if nextchar in string.whitespace\
                     or nextchar in punctuationlist:
                       return (value, offset)
     return 0 # if no exit inside for loop, fail

  def __getitem__(self,key):
     if self.caseInsensitive:
        key = string.upper(key)
     return self.KeyDict[key]

  def has_key(self,key):
     if self.caseInsensitive:
        key = string.upper(key)
     return self.KeyDict.has_key(key)
#endclass KeywordDict:
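
# A small sketch (not part of the original module) of how the prefix
# dictionary recognizes keywords only at the end of the string or when
# followed by whitespace or a punctuation character.
def _example_keyworddict():
    kd = KeywordDict(caseInsensitive=1)
    kd["BEGIN"] = ((KEYFLAG, "BEGIN"), "BEGIN")
    # matches: returns (((KEYFLAG,'BEGIN'),'BEGIN'), 5)
    print kd.hasPrefix("begin transaction", 0, ";")
    # fails: "begin" runs straight into "ning", so returns 0
    print kd.hasPrefix("beginning", 0, ";")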

# LexStringWalker walks through a string looking for
# substrings recognized by a lexical dictionary
#
#  ERROR REPORTING NEEDS IMPROVEMENT
class LexStringWalker:

  def __init__(self, String, LexDict):
    self.Position = 0
    self.NextPosition = 0
    self.String = String
    self.LexDict = LexDict
    self.PastEOF = 0
    self.Done = 0

  def DUMP(self):
    return DumpStringWindow(self.String,self.Position)

  #reset not defined

  def more(self):
    return not self.PastEOF

  def getmember(self):
    (Token,skip) = self.LexDict.Token(self.String, self.Position)
    self.NextPosition = self.Position + skip
    if Token == ENDOFFILETERM:
       self.PastEOF = 1
    return Token

  def next(self):
    if self.Done:
       data = self.DUMP()
       raise LexTokenError, "no next past end of file "+data
    elif self.PastEOF:
       self.Done=1
    elif self.NextPosition > self.Position:
       self.Position = self.NextPosition
    else:
       dummy = self.getmember()
       if self.NextPosition <= self.Position:
          data = self.DUMP()
          raise LexTokenError, "Lexical walker not advancing "+data
       self.Position = self.NextPosition

#endclass LexStringWalker
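
# A minimal walking sketch (not part of the original module), assuming LD
# is an already populated lexical dictionary such as the one built in
# _example_lexdict_usage above.  Note that the end-of-file token is also
# returned by the final getmember() call.
def _example_walk(LD, String):
    walker = LexStringWalker(String, LD)
    while walker.more():
        print walker.getmember()   # current token
        walker.next()              # advance to the next token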

# the parse class:
#   Based loosely on Aho+Ullman, Principles of Compiler Design, Ch.6,
#    except that they don't describe how to handle boundary
#    conditions, so I made that part up myself.
#
#   Note: This could be implemented using just functions; it's implemented
#    as a class to facilitate diagnostics and debugging in case of
#    failures of various sorts.
#
# a parse accepts 
#   a rule list
#
#   a lexically analysed stream with methods
#     stream.getmember()  returns the current token on the stream
#     stream.next()  moves on to next token
#     stream.more()     returns false if current token is the last token
#
#   and a FSM (finite state machine) with methods
#     FSM.root_nonTerminal
#       the nonterminal at which to start parsing
#     FSM.initial_state
#       the initial state to start at
#     FSM.successful_final_state
#       the final state to go to upon successful parse
#     FSM.map(Current_State,Current_Token)
#       returns either
#          (TERMFLAG, 0)
#             if Current_State is terminal (final or reduction).
#          (NOMATCHFLAG, 0)
#             if Current_State is nonterminal, but the Current_Token
#             and Next_Token do not lead to a valid state in the FSM
#          (MOVETOFLAG, Next_State)
#             if Current_State is nonterminal and Current_Token,
#             Next_token map to Next_State from Current_State.
#          (REDUCEFLAG, Rulenum)
#             if Current_State indicates a reduction at Current_Token
#             by the rule numbered Rulenum
#
#    and a Stack (currently implemented as a plain Python list,
#          initially empty) supporting the operations
#       Stack.Top() returns top of stack (no pop)
#          ( Stack[-1] )
#       Stack.Push(Object)
#          ( Stack.append(Object) )
#       Stack.MakeEmpty()
#          ( Stack[:] = [] )
#       Stack.IsEmpty()
#          ( len(Stack) == 0 )
#       Stack.Pop()
#          ( del Stack[-1] )
#       stack contents created by Parser will be of form (State,Value)
#       where Value was inserted at FSM state State.
#       Value is of form (tokenrep, value) where tokenrep is one of
#          (KEYFLAG, Name), (NONTERMFLAG, NontermName) or
#          (TERMFLAG, TerminalName)
#
#    and an optional parameter Evaluate which, if nonzero (the default),
#       indicates that rules should be evaluated; if 0, rules are
#       just reduced and the reduction structure is used as the
#       result of the rule
#
# rule objects must support methods
#    Rule.reduce(Stack)
#       pops off the elements corresponding to the body of the Rule
#       from the stack and returns (NewStack,Red) where NewStack is
#       the stack minus the body and Red is the result of evaluating the
#       reduction function on this instance of the rule.
#    Rule.Nonterm
#       the nonterminal at the head of the rule

class ParserObj:

  # Evaluate determines whether rules should be evaluated
  # after reductions.  Context is an argument passed to the
  # list reduction function
  #
  def __init__(self, Rulelist, Stream, FSM, Stack, \
                Evaluate=1, \
                Context=None):
    self.Rules = Rulelist
    self.LexStream = Stream
    self.FSM = FSM
    self.Stack = Stack
    self.Context = Context

    # start with empty stack, initial_state, no nonterminal
    #self.Stack[-1] = 0#   self.Stack.MakeEmpty()
    self.Stack[:] = []
    self.State = FSM.initial_state
    self.currentNonterm = None
    self.Evaluate = Evaluate

  # DoOneReduction accepts tokens from the stream and pushes
  # them onto the stack until a reduction state is reached.
  #
  # Resolve the reduction
  #
  def DoOneReduction(self):
    current=self.State
    FSM=self.FSM
    Stack = self.Stack
    Context = self.Context
    Stream = self.LexStream
    # the internal FSM.StateTokenMap dictionary is used directly here.
    STMap = FSM.StateTokenMap
    #if FSM.final_state(current):
    #   raise ParseInitError, 'trying to reduce starting at final state'

    tokenVal = Stream.getmember()
    #print "tokenVal", tokenVal
    token = tokenVal[0]

    # push the token and traverse FSM until terminal state is reached
    #(flag, nextThing) = FSM.map(current, token)
    key = (current, token)
    try:
       (flag, nextThing) = STMap[key][0]
    except KeyError:
       flag = NOMATCHFLAG

    while flag == MOVETOFLAG:
       nextState = nextThing
       #print current, " shift ", token, 
       # no sanity check, possible infinite loop

       # push current token and next state
       ThingToPush = (nextState, tokenVal)
       #print "pushing ", ThingToPush
       #Stack[-1]=Stack[-1]+1; Stack[Stack[-1]]=ThingToPush
       Stack.append(ThingToPush)
       #Stack.Push( ThingToPush )

       # move to next token, next state
       Stream.next()
       # error if end of stream
       if not Stream.more(): # optimized Stream.PastEOF (?)
          data = Stream.DUMP()
          raise EOFError, 'end of stream during parse '+data

       current = nextState
       tokenVal = Stream.getmember()
       token = tokenVal[0]
       
       #MAP = FSM.map(current,token)
       key = (current, token)
       try:
          (flag, nextThing) = STMap[key][0]
       except KeyError:
          flag = NOMATCHFLAG

    # at end of while loop we should be at a reduction state

    if flag == REDUCEFLAG:
       rulenum = nextThing
       #print current, " reduce ", token, self.Rules[rulenum]
       # normal case
       # perform reduction
       rule = self.Rules[rulenum]
       Nonterm = rule.Nonterm
       self.currentNonterm = Nonterm
       (Stack, reduct) = rule.reduce( Stack , Context )
#       self.Stack = Stack #not needed, unless stack rep changes
       GotoState = self.GotoState(rule)
       # push the Gotostate and result of rule reduction on stack
       ThingToPush = (GotoState, (Nonterm, reduct) )
       # push the result of the reduction and exit normally
       #print "pushing ", ThingToPush
       #Stack[-1]=Stack[-1]+1; Stack[Stack[-1]]=ThingToPush
       Stack.append(ThingToPush)
       #Stack.Push(ThingToPush)
       self.State=GotoState
       return 1  # normal successful completion

    # some error cases
    elif flag == NOMATCHFLAG:
       self.ParseError(current,tokenVal, "nomatch1")
    #elif FSM.final_state(current):
    #   raise BadFinalError, 'unexpected final state reached in reduction'
    else:
       data = Stream.DUMP()
       s = """
           flag = %s
           map = %s """ % (flag, FSM.map(current,token))
       data = data + s
       raise FlowError, 'unexpected else '+data
  #enddef DoOneReduction

  # compute the state to goto after a reduction is performed
  # on a rule.
  # Algorithm: determine the state at beginning of reduction
  #  and the next state indicated by the head nonterminal of the rule.
  #  special case: empty stack and root nonterminal => success.
  #
  def GotoState(self, rule):
     FSM = self.FSM
     Stack = self.Stack
     Head = rule.Nonterm
     if len(Stack)==0: #Stack[-1]==0: #Stack.IsEmpty():
        BeforeState = FSM.initial_state
     else:
        BeforeState = Stack[-1][0] #Stack[Stack[-1]][0] #Stack.Top()[0]
     # is this right? if the stack is empty and the Head
     # is the root nonterm, then goto is final state
     if len(Stack)==0 and Head == FSM.root_nonTerminal:#Stack.isEmpty()
        Result = FSM.successful_final_state
     else:
        # consider eliminating the call to .map here? (efficiency)
        (flag, Result) = FSM.map(BeforeState, Head)
        if flag != MOVETOFLAG:
           #FSM.DUMP()
           self.ParseError(BeforeState, Head, "notmoveto")
     return Result

  def ParseError( self, State, Token, *rest):
    # make this parse error nicer (add diagnostic methods?)
    L = [""]
    L.append("*******************************")
    L.append("current state = "+`State`)
    L.append("expects: ")
    expects = ""
    for (flag,name) in self.FSM.Expects(State):
       if flag in (TERMFLAG, KEYFLAG):
          expects = expects + `name`+ ", "
    L.append(expects)
    L.append(`rest`)
    L.append("current token = " + `Token`)
    #print "Stack =", 
    #self.StackDump(5)
    #print
    from string import join
    data = self.LexStream.DUMP() + join(L, "\n")
    raise SyntaxError, 'unexpected token sequence.' + data

  def StackDump(self, N):
    Stack = self.Stack
    Topkey = len(Stack)
    if Topkey>N:
       Start = Topkey - N
    else:
       Start = 1
    for i in range(Start,Topkey+1):
       print " :: ", Stack[i],

  # execute parsing until done:
  def GO(self):
    while self.State != self.FSM.successful_final_state:
              #self.FSM.final_state(self.State):
       self.DoOneReduction()
    # should I check that stack has only one elt here?
    # return result of last reduction
    return self.Stack[-1][1] #self.Stack.Top()[1]

#endclass ParserObj

# function for declaring a variable to represent a nonterminal:
# eg Program = nonterminal("program") 
#  included for convenient autodocumentation
#
def nonterminal(string):
   return (NONTERMFLAG, string)

# declaring a terminal WITHOUT INSTALLING IT IN A LexDict
def termrep(string):
   return (TERMFLAG, string)

# the rule class
#  a rule is defined by a goal nonterminal marker of form
#    (NONTERMFLAG, Name)
#  and a list defining the body which must contain elts of form
#    (KEYFLAG, Name) or (NONTERMFLAG, Name) or (TERMFLAG, Name)
#  and a reduction function which takes a list of the same size
#  as the BodyList (consisting of the results of the evaluations of
#  the previous reductions)
#  and returns an interpretation for the body

# the following function is used as a default reduction function
# for rules
def DefaultReductFun( RuleResultsList, Context ):
   if WARNONDEFAULTS:
      print "warn: default reduction."
      print "   ", RuleResultsList
   return RuleResultsList

class ParseRule:

  def __init__(self, goalNonTerm, BodyList, \
         ReductFunction = DefaultReductFun):
     #print BodyList
     # check some of the arguments (very limited!)
     if len(goalNonTerm) != 2 or goalNonTerm[0] != NONTERMFLAG:
        raise TypeError, "goal of rule must be nonterminal"
     for m in BodyList:
        #print m
        if len(m) != 2:
           raise TypeError, "invalid body form for rule"
     self.Nonterm = goalNonTerm
     self.Body = BodyList
     self.ReductFun = ReductFunction

  # for dumping/reconstruction: LOSES THE INTERPRETATION FUNCTION!
  def __repr__(self):
     return THISMODULE + ".ParseRule" + `self.components()`

  # marshal-able components of a rule
  def components(self):
     return (self.Nonterm, self.Body)

  # rule.reduce(Stack) pops off the stack elements corresponding
  # to the body of the rule and prepares the appropriate reduction
  # object for evaluation (or not) at higher levels
  #
  def reduce(self, Stack, Context=None):
     #print "reducing", Stack
     Blength = len(self.Body)
     #print Blength, len(self.Body)
     # pop off previous results from stack corresponding to body
     BodyResults = [None] * Blength
     #BodyNames = [None] * Blength # for debug
     #print "popping: "
     for i in range(1,Blength+1):
        Bindex = Blength - i  # stack contents pop off in reverse order

        # get and destructure the rule body entry
        RuleEntry = self.Body[Bindex]
        ( REkind , REname ) = RuleEntry

        # get and destructure the stack entry
        PoppedValue = Stack[-i] #Stack.Top()
        #print PoppedValue,
        #del Stack[-1]# = Stack[-1]-1 #Stack.Pop()
        SETokVal = PoppedValue[1]
        SEvalue = SETokVal[1]
        SEname = SETokVal[0][1]

        # the names from rule and stack must match (?)
        if SEname != REname:
           print SEname, REname
           print self
           raise ReductError, " token names don't match"

        # store the values for the reduction
        BodyResults[Bindex] = SEvalue
        #BodyNames[Bindex] = SEname # debug
     #endfor
     del Stack[len(Stack)-Blength:]
     #print "reduced", Stack
     #print
     # evaluate the reduction, in context
     reduct = self.ReductFun(BodyResults, Context)
     if WARNONDEFAULTS and self.ReductFun is DefaultReductFun:
        # should check whether name is defined before this...
        print "  default used on ", self.Name
     #Reduction( self.ReductFun, BodyResults, BodyNames )
     return (Stack, reduct)
  #enddef ParseRule.reduce

#endclass ParseRule
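
# A minimal declaration sketch (not part of the original module).  The
# token representations and the reduction function are illustrative only;
# in real use the keyword and terminal representations come from a
# LexDictionary and the rule is installed in a Grammar.
def _example_rule():
    Value = nonterminal("value")
    Expr = nonterminal("expression")
    plus = (KEYFLAG, "+")              # as returned by LD.punctuation("+")
    def _add(reductions, Context):
        # one entry per body element, in body order
        [left, op, right] = reductions
        return left + right
    return ParseRule(Expr, [Value, plus, Expr], _add)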

# for debugging: look through a rule list
#  and print names of rules that have default binding
#
def PrintDefaultBindings(rulelist):
    for r in rulelist:
        if r.ReductFun is DefaultReductFun:
           print r.Name

# the FSM class
#
class FSMachine:

  def __init__(self, rootNonTerm):

    # start and success state conventions
    startState=1
    successState=0

    self.root_nonTerminal = rootNonTerm
    self.initial_state = startState
    self.successful_final_state = successState

    # the list of states of the FSM, implemented as a dictionary
    #  entries are identified by their index
    #  content is 
    #  a list whose first elt is either TRANSFLAG, or TERMFLAG
    #  other list elts may be added by other layers (parse generator)
    #  indicating the kind of the state.
    self.States = {}

    # allocate start and success states
    self.States[startState]=[TRANSFLAG]
    self.States[successState]=[TERMFLAG]

    # the most recently allocated state
    self.maxState= startState

    # the map of current token+state number to next state
    #with entries of form (tokenname,state):nextstate_sequence
    #
    self.StateTokenMap = {}
  #enddef FSM()

  # ForbiddenMark is for filtering out maps to an error state
  def DUMP(self, DumpMapData=1, DumpStateData=1, ForbiddenMark={}):
    print "root nonterminal is ", self.root_nonTerminal
    print "start at ", self.initial_state
    print "end at ", self.successful_final_state
    print "number of states: ", self.maxState
    if DumpStateData:
       print
       for State in range(0,self.maxState+1):
          Data = self.States[State]
          print State, ": ", Data
    if DumpMapData:
       print
       for key in self.StateTokenMap.keys():
          map = self.StateTokenMap[key]
          if map[0][0] == MOVETOFLAG:
             ToStateData = self.States[map[0][1]]
             if len(ToStateData) < 2:
                Mark = None
             else:
                Mark = ToStateData[1]
             if Mark != ForbiddenMark:
                print key, " > ", map, " = ", ToStateData
          else:
             print key, " > reduction to rule number ", map[0][1]

  # what tokens does a state expect?
  def Expects(self, State):
    keys = self.StateTokenMap.keys()
    Tokens = kjSet.NewSet( [] )
    for (state1,token) in keys:
       if State == state1:
          kjSet.addMember(token,Tokens)
    return kjSet.get_elts(Tokens)

  # "allocate" a new state of specified kind
  #   kind must either be TRANSFLAG, TERMFLAG or REDUCEFLAG
  # returns the number of the new state
  def NewState(self, kind, AdditionalInfo = []):
    if not kind in (TRANSFLAG,TERMFLAG,REDUCEFLAG):
       raise TypeError, "unknown state kind"
    available = self.maxState+1

    self.States[available] = [kind] + AdditionalInfo
    self.maxState = available
    return available

  # Install a reduction transition in the FSM:
  # a reduction is represented by mapping to a rule index
  # no nondeterminism is allowed.
  def SetReduction(self, fromState, TokenRep, Rulenum):
    key = (fromState, TokenRep)
    if not self.StateTokenMap.has_key(key):
       self.StateTokenMap[ key ] = ((REDUCEFLAG, Rulenum),)
    else:
       raise ReductError, "attempt to set ambiguous reduction"

  # Install a "shift" or "goto" transition in the FSM:
  # supports nondeterminism by storing a sequence of possible transitions
  #
  def SetMap(self, fromState, TokenRep, toState): 
    key = (fromState, TokenRep)
    if self.StateTokenMap.has_key(key):
       Old = self.StateTokenMap[key]
       if Old[0][0] != MOVETOFLAG:
          # if the old mapping was not itself a shift/goto transition,
          # complain:
          raise NondetError, \
            "attempt to make inappropriate transition ambiguous"
       self.StateTokenMap[ key ] = Old + ((MOVETOFLAG,toState),)
    else:
       self.StateTokenMap[ key ] = ((MOVETOFLAG,toState),)

  # Find the action indicated by fsm on 
  #  (current_state, current_token) input.
  #
  # note: in the event of nondeterministic choice this chooses
  #   the first possibility listed.
  # ParseObj.DoOneReduction() currently uses the internal structure
  #  of StateTokenMap directly, rather than using this function.
  #
  def map(self, current_state, current_token):
    StateEntry = self.States[current_state][0]
    if StateEntry == TERMFLAG:
       return (TERMFLAG, 0)
    elif StateEntry == TRANSFLAG:
       # try to find a transition for this token and state
       key = (current_state, current_token)
       try:
          TMap = self.StateTokenMap[key]
          #print "TMap ", TMap
          #print "key ", key
          #print
          return TMap[0]
       except KeyError:
          return (NOMATCHFLAG, 0)
    else:
       raise FlowError, "unexpected else (2)"
  #enddef map

#endclass FSMachine
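
# A tiny hand-built machine (not part of the original module) showing the
# shift/reduce bookkeeping; real state tables are generated by the
# companion grammar-building code rather than written by hand.
def _example_fsm():
    X = nonterminal("X")
    a = termrep("a")
    fsm = FSMachine(X)
    s2 = fsm.NewState(TRANSFLAG)
    fsm.SetMap(fsm.initial_state, a, s2)      # shift on terminal "a"
    fsm.SetReduction(s2, ENDOFFILETOKEN, 0)   # reduce by rule 0 at eof
    print fsm.map(fsm.initial_state, a)       # (MOVETOFLAG, 2)
    print fsm.map(s2, ENDOFFILETOKEN)         # (REDUCEFLAG, 0)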

# the grammar class:
#   a grammar consists of 
#     - a LexDict lexical dictionary;
#     - a deterministic FSMachine;
#     - a Rulelist
#   and optionally a dictionary that maps Rulenames
#   to Rulelist indices (used for dumping and externally)
#   
class Grammar:

   def __init__(self, LexD, DFA, RuleL, RuleNameDict = None):
      # for auto initialization set LexD,DFA,RuleL to None
      if LexD == None and DFA == None and RuleL == None:
         self.LexD = LexDictionary()
         # use a dummy root nonterminal -- must fix elsewhere!
         self.DFA = FSMachine("ERROR")
         self.RuleL = []
      else:
         self.LexD = LexD
         self.DFA = DFA
         self.RuleL = RuleL
      if RuleNameDict != None:
         self.AddNameDict(RuleNameDict)
      self.CleanUp()
   #enddef __init__

   # look for default bindings
   def PrintDefaults(self):
       print "Default bindings on:"
       PrintDefaultBindings(self.RuleL)

   # setting case sensitivity: must happen before keyword installation
   # in LexD.
   def SetCaseSensitivity( self, Boolean ):
      self.LexD.SetCaseSensitivity( Boolean )

   # this may be silly, but to save some space in construction
   # a token dictionary may be used that facilitates sharing of
   # token representations.  This method either initializes
   # the dictionary or disposes of it if it exists
   def CleanUp(self):
      self.IndexToToken = {}
      # this dictionary is used by automatically
      # generated grammars to determine whether
      # a string represents a nonterminal
      self.NonTermDict = {}
      # similarly for terminals
      self.TermDict = {}
      # this string may be used to keep a printable
      # representation of the rules of the grammar
      # (usually in automatic grammar generation)
      self.RuleString = ""

   # to associate a token to an integer use
   # self.IndexToToken[int] = tokenrep

   # this method associates rules to names using a
   # RuleNameDict dictionary which maps names to rule indices.
   # after invocation
   #   self.RuleNameToIndex[ name ] gives the index
   #     in self.RuleL for the rule associated with name, and
   #   self.RuleL[index].Name gives the name associated
   #     with the rule self.RuleL[index]
   #
   def AddNameDict(self, RuleNameDict):
     self.RuleNameToIndex = RuleNameDict
     # add a Name attribute to the rules of the rule list
     for ruleName in RuleNameDict.keys():
        index = RuleNameDict[ ruleName ]
        self.RuleL[ index ].Name = ruleName

   # parse a string using the grammar, return result and context
   def DoParse( self, String, Context = None, DoReductions = 1 ):
     # construct the ParserObj
     Stream = LexStringWalker( String, self.LexD )
     Stack = [] # {-1:0} #Walkers.SimpleStack()
     ParseOb = ParserObj( self.RuleL, Stream, self.DFA, Stack, \
                         DoReductions, Context )
     # do the parse
     ParseResult = ParseOb.GO()
     # return final result of reduction and the context
     return (ParseResult[1], Context)
   #enddef DoParse

   # parse a string using the grammar, but only return
   # the result of the last reduction, without the context
   def DoParse1( self, String, Context=None, DoReductions=1 ):
       return self.DoParse(String, Context, DoReductions)[0]

   # if the Name dictionary has been initialized
   # this method will (re)bind a reduction function to
   # a rule associated with Rulename      
   #
   def Bind( self, Rulename, NewFunction ):
     ruleindex = self.RuleNameToIndex[ Rulename ]
     rule = self.RuleL[ ruleindex ]
     rule.ReductFun = NewFunction
   #enddef Bind

   # bind a terminal to a regular expression and interp function
   # in the lexical dictionary (convenience)
   def Addterm( self, termname, regexpstr, funct ):
     self.TermDict[ termname ] =\
         self.LexD.terminal( termname, regexpstr, funct )
#endclass Grammar
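
# A minimal client-side sketch (not part of the original module).  It
# assumes GRAM is a fully built Grammar (for example unmarshalled by
# UnMarshalGram below, or produced by the companion grammar-building
# module) containing a rule registered under the illustrative name
# "addrule".
def _example_parse(GRAM):
    def _add(reductions, Context):
        return reductions[0] + reductions[2]
    GRAM.Bind("addrule", _add)            # (re)bind the rule semantics
    (result, context) = GRAM.DoParse("1 + 2", Context={})
    return result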

# function to create a "null grammar"
def NullGrammar():
   return Grammar(None,None,None,{})

# unmarshalling a marshalled grammar created by
#   buildmodule.CGrammar.MarshalDump(Tofile)
#   tightly coupled with buildmodule code...
# file should be open and "pointing to" the marshalled rep.
#
# warning: doesn't bind semantics!
#
def UnMarshalGram(file):
    Grammar = NullGrammar()
    UnMarshal = UnMarshaller(file, Grammar)
    UnMarshal.MakeLex()
    UnMarshal.MakeRules()
    UnMarshal.MakeTransitions()
    UnMarshal.Cleanup()
    return UnMarshal.Gram
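
# A minimal loading sketch (not part of the original module); the file name
# is illustrative.  Marshalling loses the reduction functions, so semantics
# must be rebound (via Grammar.Bind) after loading.
def _example_load_grammar(filename="grammar.mar"):
    f = open(filename, "rb")
    GRAM = UnMarshalGram(f)
    f.close()
    return GRAM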

# unmarshalling object for unmarshalling grammar from a file
#
class UnMarshaller:

   def __init__(self, file, Grammar):
      import marshal
      self.Gram = Grammar
      BigList = marshal.load(file)
      if type(BigList) != type([]):
         raise FlowError, "bad type for unmarshalled list"
      if len(BigList) != 9:
         raise FlowError, "unmarshalled list of wrong size"
      self.tokens = BigList[0]
      self.punct = BigList[1]
      self.comments = BigList[2]
      self.RuleTups = BigList[3]
      self.MaxStates = BigList[4]
      self.reducts = BigList[5]
      self.moveTos = BigList[6]
      self.Root = BigList[7]
      self.CaseSensitivity = BigList[8]
      
      Grammar.SetCaseSensitivity( self.CaseSensitivity )

   def MakeLex(self):
      Grammar=self.Gram
      LexD = Grammar.LexD
      # punctuations
      LexD.punctuationlist = self.punct
      # comments
      for commentregex in self.comments:
          LexD.comment(commentregex)
      #LexD.commentstring = self.comments
      # keywords, terminals, nonterms
      #   rewrite the tokens list for sharing and extra safety
      LexTokens = {}
      tokens = self.tokens
      for tokenindex in range(len(tokens)):
          (kind,name) = tokens[tokenindex]
          if kind == KEYFLAG:
             tokens[tokenindex] = LexD.keyword(name)
          elif not kind in [TERMFLAG, NONTERMFLAG]:
             raise FlowError, "unknown token type"
      # not needed
      self.tokens = tokens

   def MakeRules(self):
      Grammar = self.Gram
      Grammar.DFA.root_nonTerminal = self.Root
      NameIndex = Grammar.RuleNameToIndex
      RuleTuples = self.RuleTups
      nRules = len(RuleTuples)
      RuleList = [None] * nRules
      for index in range(nRules):
         (Name, Components) = RuleTuples[index]
         rule = apply(ParseRule, Components)
         rule.Name = Name
         RuleList[index] = rule
         NameIndex[Name] = index
      Grammar.RuleL = RuleList

   def MakeTransitions(self):
      Grammar = self.Gram
      DFA = Grammar.DFA
      StateTokenMap = DFA.StateTokenMap
      tokens = self.tokens
      # record the state number
      DFA.maxState = self.MaxStates
      # this is historical, unfortunately...  CLEAN IT UP SOMEDAY!
      # THE DFA.States DICT IS NOT NEEDED (?) (here)
      for state in range(1, self.MaxStates+1):
         DFA.States[state] = [TRANSFLAG]
      # record the reductions
      for (fromState, TokenIndex, rulenum) in self.reducts:
         DFA.SetReduction(fromState, tokens[TokenIndex], rulenum)
      # record the transitions
      for (fromState, TokenIndex, ToState) in self.moveTos:
         DFA.SetMap(fromState, tokens[TokenIndex], ToState)

   def Cleanup(self):
      Grammar = self.Gram
      Grammar.CleanUp()

################# FOLLOWING CODE IS FOR REGRESSION TESTING ONLY
################# DELETE IT IF YOU WANT/NEED
#### All tests for this module deleted, since
#### ParseBuild module tests are sufficient.  
