6.7.1. Python Tokeniser

This module is a modified version of tokenize.py from the standard library, which was marked
  __version__ = "Ka-Ping Yee, 26 October 1997; patched, GvR 3/30/98"
The module provides tokenisation of Python source code.

The module provides a class 'python_tokeniser' and a function 'tokenise'.

The function tokenise is provided for compatibility with the interface of the original tokenize.py. It accepts up to four arguments. The first argument, readline, is required: it is a callback which fetches a line for tokenisation, and it should return a line with a trailing newline character, or an empty string to indicate end of input. The second argument, tokeneater, is a callback which is called once for each token, with the fields of the token as its arguments. If omitted, it defaults to a pretty-printing routine which writes a formatted display of each token to sys.stdout.
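As a sketch of driving the function with a custom tokeneater which collects NAME tokens (the import path interscript.tokenisers.python and the input file example.py are assumptions, not part of this module):

  # Sketch only: the import path and 'example.py' are assumed.
  from interscript.tokenisers.python import tokenise
  from token import NAME

  names = []
  def name_collector(type, token, start, end, line):
      if type == NAME: names.append(token)

  f = open('example.py')
  tokenise(f.readline, name_collector)  # omit name_collector to pretty-print to sys.stdout
  f.close()
  print names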

The class constructor and the function accept two optional arguments. The argument squashop defaults to 0 for the constructor and 1 for the function. If it is set, all special tokens (operators, brackets and punctuation) are reported as the single token OP; otherwise each is reported as its specific token (PLUS, LPAR, COLON and so on). The argument report_comments defaults to 0 for the constructor and 1 for the function. If it is set, comments are reported as COMMENT tokens, and blank lines, comment lines and end of lines which do not terminate a statement are reported as NL.

If squashop and report_comments are both zero, the result is a 'pure' token stream suitable for parsing; if both are set, the result is more suitable for pretty printing, as illustrated by the sketch below.
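The following sketch (the import path is an assumption) tokenises the same line in both modes:

  from interscript.tokenisers.python import python_tokeniser

  line = 'x = 1  # set x\n'

  pure = python_tokeniser().tokenize(line)        # squashop=0, report_comments=0
  pretty = python_tokeniser(1, 1).tokenize(line)  # squashop=1, report_comments=1

  # 'pure' contains NAME, EQUAL, NUMBER and NEWLINE tokens;
  # 'pretty' reports '=' as OP and also includes a COMMENT token.
  for tok in pretty:
      print tok[0], repr(tok[1])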

The class provides the following methods. The method reset() resets the tokeniser state. The method write accepts arbitrary text data; any trailing partial line is buffered until the rest of the line arrives. The method writeline must be called with a single line including a trailing newline character, or with an empty string to indicate end of input. The method get_tokens fetches the tokens which have been produced so far and clears the token queue. The method close signals end of input and returns any trailing tokens. The method tokenize accepts arbitrary text data and returns the tokens from the queue. Tokens which span lines are reported after the line in which they terminate has been processed.
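A sketch of incremental use (import path assumed), feeding arbitrary chunks and collecting the tokens:

  from interscript.tokenisers.python import python_tokeniser

  t = python_tokeniser()
  t.write('if x:\n  y = ')      # the trailing partial line is buffered
  t.write('1\n')                # completes the buffered line
  toks = t.get_tokens()         # tokens for the two complete lines
  toks = toks + t.close()       # end of input: trailing DEDENT and ENDMARKER
  for tok in toks:
      print tok[0], repr(tok[1])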

A token consists of an integer token type corresponding to the Python tokens listed in the file token.py, the lexeme which the token represents, the starting and ending positions of the lexeme as (line, column) pairs, and the source line (or lines, for tokens which span lines) containing the lexeme. Lines are numbered from 1, columns from 0.
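For example (import path assumed), tokenising the single line 'x = 1\n' in the default 'pure' mode yields 5-tuples of the form (type, lexeme, (srow, scol), (erow, ecol), line):

  from interscript.tokenisers.python import python_tokeniser, tok_name

  t = python_tokeniser()
  for type, lexeme, start, end, line in t.tokenize('x = 1\n'):
      print tok_name[type], repr(lexeme), start, end
  # NAME 'x' (1, 0) (1, 1)
  # EQUAL '=' (1, 2) (1, 3)
  # NUMBER '1' (1, 4) (1, 5)
  # NEWLINE '\n' (1, 5) (1, 6)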

Start python section to interscript/tokenisers/python.py[1]
     1: #line 49 "iscrtkpy.ipk"
     2: __version__ = "Ka-Ping Yee 1997/10/26; GvR 1998/3/20, Skaller 1998/7/24"
     3: 
     4: import string, re
     5: from token import *
     6: 
     7: COMMENT = N_TOKENS
     8: tok_name[COMMENT] = 'COMMENT'
     9: NL = N_TOKENS + 1
    10: tok_name[NL] = 'NL'
    11: WHITESPACE = N_TOKENS+2
    12: tok_name[WHITESPACE] = 'WHITESPACE'
    13: 
    14: 
    15: # Changes from 1.3:
    16: #     Ignore now accepts \f as whitespace.  Operator now includes '**'.
    17: #     Ignore and Special now accept \n or \r\n at the end of a line.
    18: #     Imagnumber is new.  Expfloat is corrected to reject '0e4'.
    19: # Note: to quote a backslash in a regex, it must be doubled in a r'aw' string.
    20: 
    21: def group(*choices): return '(' + string.join(choices, '|') + ')'
    22: def any(*choices): return apply(group, choices) + '*'
    23: def maybe(*choices): return apply(group, choices) + '?'
    24: 
    25: Whitespace = r'[ \f\t]*'
    26: Comment = r'#[^\r\n]*'
    27: Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
    28: Name = r'[a-zA-Z_]\w*'
    29: 
    30: Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
    31: Octnumber = r'0[0-7]*[lL]?'
    32: Decnumber = r'[1-9]\d*[lL]?'
    33: Intnumber = group(Hexnumber, Octnumber, Decnumber)
    34: Exponent = r'[eE][-+]?\d+'
    35: Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
    36: Expfloat = r'[1-9]\d*' + Exponent
    37: Floatnumber = group(Pointfloat, Expfloat)
    38: Imagnumber = group(r'0[jJ]', r'[1-9]\d*[jJ]', Floatnumber + r'[jJ]')
    39: Number = group(Imagnumber, Floatnumber, Intnumber)
    40: 
    41: Single = any(r"[^'\\]", r'\\.') + "'"
    42: Double = any(r'[^"\\]', r'\\.') + '"'
    43: Single3 = any(r"[^'\\]",r'\\.',r"'[^'\\]",r"'\\.",r"''[^'\\]",r"''\\.") + "'''"
    44: Double3 = any(r'[^"\\]',r'\\.',r'"[^"\\]',r'"\\.',r'""[^"\\]',r'""\\.') + '"""'
    45: Triple = group("[rR]?'''", '[rR]?"""')
    46: String = group("[rR]?'" + any(r"[^\n'\\]", r'\\.') + "'",
    47:                '[rR]?"' + any(r'[^\n"\\]', r'\\.') + '"')
    48: 
    49: Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '\|',
    50:                  '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
    51: Bracket = '[][(){}]'
    52: Special = group(r'\r?\n', r'[:;.,`]')
    53: Funny = group(Operator, Bracket, Special)
    54: 
    55: PlainToken = group(Number, Funny, String, Name)
    56: Token = Ignore + PlainToken
    57: 
    58: ContStr = group("[rR]?'" + any(r'\\.', r"[^\n'\\]") + group("'", r'\\\r?\n'),
    59:                 '[rR]?"' + any(r'\\.', r'[^\n"\\]') + group('"', r'\\\r?\n'))
    60: PseudoExtras = group(r'\\\r?\n', Comment, Triple)
    61: PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
    62: 
    63: tokenprog, pseudoprog, single3prog, double3prog = map(
    64:     re.compile, (Token, PseudoToken, Single3, Double3))
    65: endprogs = {"'": re.compile(Single), '"': re.compile(Double),
    66:             "'''": single3prog, '"""': double3prog,
    67:             "r'''": single3prog, 'r"""': double3prog,
    68:             "R'''": single3prog, 'R"""': double3prog, 'r': None, 'R': None}
    69: 
    70: opdict = {
    71:   '(':LPAR,
    72:   ')':RPAR,
    73:   '[':LSQB,
    74:   ']':RSQB,
    75:   ':':COLON,
    76:   ',':COMMA,
    77:   ';':SEMI,
    78:   '+':PLUS,
    79:   '-':MINUS,
    80:   '*':STAR,
    81:   '/':SLASH,
    82:   '|':VBAR,
    83:   '&':AMPER,
    84:   '<':LESS,
    85:   '>':GREATER,
    86:   '=':EQUAL,
    87:   '.':DOT,
    88:   '%':PERCENT,
    89:   '`':BACKQUOTE,
    90:   '{':LBRACE,
    91:   '}':RBRACE,
    92:   '==':EQEQUAL,
    93:   '!=':NOTEQUAL,
    94:   '<>':NOTEQUAL,
    95:   '<=':LESSEQUAL,
    96:   '>=':GREATEREQUAL,
    97:   '~':TILDE,
    98:   '^':CIRCUMFLEX,
    99:   '<<':LEFTSHIFT,
   100:   '>>':RIGHTSHIFT,
   101:   '**':DOUBLESTAR
   102:   }
   103: 
   104: tabsize = 8
   105: TokenError = 'TokenError'
   106: def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
   107:     print "%d,%d-%d,%d:\t%s\t%s" % \
   108:         (srow, scol, erow, ecol, tok_name[type], repr(token))
   109: 
   110: def tokenise(readline, tokeneater=printtoken, squashop=1, report_comments=1):
   111:   t = python_tokeniser(squashop, report_comments)
   112:   line = readline()
   113:   while line:
   114:     t.writeline(line)
   115:     for token in t.tokens:
   116:       apply(tokeneater,token)
   117:     t.tokens = []
   118:     line = readline()
   119:   t.writeline('')
   120:   for token in t.tokens:
   121:     apply(tokeneater,token)
   122:   t.tokens = []
   123: 
   124: namechars, numchars = string.letters + '_', string.digits
   125: 
   126: class python_tokeniser:
   127:   def __init__(self, squashop=0, report_comments=0):
   128:     self.squashop = squashop
   129:     self.report_comments = report_comments
   130:     self.reset()
   131: 
   132:   def reset(self):
   133:     self.lnum = self.parenlev = self.continued = 0
   134:     self.contstr, self.needcont = '', 0
   135:     self.contline = None
   136:     self.indents = [0]
   137:     self.tokens = []
   138:     self.buffer = ''
   139: 
   140:   def get_tokens(self):
   141:     tmp = self.tokens
   142:     self.tokens = []
   143:     return tmp
   144: 
   145:   def tokenize(self,data):
   146:     self.write(data)
   147:     return self.get_tokens()
   148: 
   149:   def tokeneater(self,*args):
   150:     self.tokens.append(args)
   151: 
   152:   def close(self):
   153:     if self.buffer:
   154:       self.writeline(self.buffer)
   155:       self.buffer = ''
   156:     self.writeline('')
   157:     return self.get_tokens()
   158: 
   159:   def write(self,data):
   160:     lines = string.split(data,'\n')
   161:     if lines:
   162:       lines[0] = self.buffer + lines[0]   # prepend the buffered partial line
   163:       self.buffer = ''
   164:     for line in lines[:-1]:
   165:       self.writeline(line+'\n')
   166:     self.buffer = lines[-1]
   167: 
   168:   def writeline(self,line):
   169:     lnum = self.lnum = self.lnum + 1
   170:     pos, max = 0, len(line)
   171:     tokeneater = self.tokeneater
   172: 
   173:     if self.contstr:                                   # continued string
   174:         if not line:
   175:             raise TokenError, ("EOF in multi-line string", self.strstart)
   176:         endmatch = self.endprog.match(line)
   177:         if endmatch:
   178:             pos = end = endmatch.end(0)
   179:             tokeneater(STRING, self.contstr + line[:end],
   180:                        self.strstart, (lnum, end), self.contline + line)
   181:             self.contstr, self.needcont = '', 0
   182:             self.contline = None
   183:         elif self.needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
   184:             tokeneater(ERRORTOKEN, self.contstr + line,
   185:                        self.strstart, (lnum, len(line)), self.contline)
   186:             self.contstr = ''
   187:             self.contline = None
   188:             return
   189:         else:
   190:             self.contstr = self.contstr + line
   191:             self.contline = self.contline + line
   192:             return
   193: 
   194:     elif self.parenlev == 0 and not self.continued:    # new statement
   195:         if not line: self._close(); return
   196: 
   197:         column = 0
   198:         while pos < max:                               # measure leading whitespace
   199:             if line[pos] == ' ': column = column + 1
   200:             elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
   201:             elif line[pos] == '\f': column = 0
   202:             else: break
   203:             pos = pos + 1
   204:         if pos == max: self._close(); return           # omitted newline
   205: 
   206:         if line[pos] in '#\r\n':                       # skip comments or blank lines
   207:             if self.report_comments:
   208:               tokeneater((NL, COMMENT)[line[pos] == '#'], line[pos:],
   209:                        (lnum, pos), (lnum, len(line)), line)
   210:             return
   211: 
   212:         if column > self.indents[-1]:                  # count indents or dedents
   213:             self.indents.append(column)
   214:             tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
   215:         while column < self.indents[-1]:
   216:             self.indents = self.indents[:-1]
   217:             tokeneater(DEDENT, '', (lnum, pos), (lnum, pos), line)
   218: 
   219:     else:                                              # continued statement
   220:         if not line:
   221:             raise TokenError, ("EOF in multi-line statement", (lnum, 0))
   222:         self.continued = 0
   223: 
   224:     while pos < max:
   225:         pseudomatch = pseudoprog.match(line, pos)
   226:         if pseudomatch:                                # scan for tokens
   227:             start, end = pseudomatch.span(1)
   228:             spos, epos, pos = (lnum, start), (lnum, end), end
   229:             token, initial = line[start:end], line[start]
   230: 
   231:             if initial in numchars \
   232:                 or (initial == '.' and token != '.'):  # ordinary number
   233:                 tokeneater(NUMBER, token, spos, epos, line)
   234:             elif initial in '\r\n':
   235:                 if self.parenlev == 0:
   236:                   tokeneater(NEWLINE, token, spos, epos, line)
   237:                 elif self.report_comments:
   238:                   tokeneater(NL, token, spos, epos, line)
   239: 
   240:             elif initial == '#':
   241:                 if self.report_comments:
   242:                   tokeneater(COMMENT, token, spos, epos, line)
   243:             elif token in ("'''", '"""',               # triple-quoted
   244:                            "r'''", 'r"""', "R'''", 'R"""'):
   245:                 self.endprog = endprogs[token]
   246:                 endmatch = self.endprog.match(line, pos)
   247:                 if endmatch:                           # all on one line
   248:                     pos = endmatch.end(0)
   249:                     token = line[start:pos]
   250:                     tokeneater(STRING, token, spos, (lnum, pos), line)
   251:                 else:
   252:                     self.strstart = (lnum, start)      # multiple lines
   253:                     self.contstr = line[start:]
   254:                     self.contline = line
   255:                     break
   256:             elif initial in ("'", '"') or \
   257:                 token[:2] in ("r'", 'r"', "R'", 'R"'):
   258:                 if token[-1] == '\n':                  # continued string
   259:                     self.strstart = (lnum, start)
   260:                     self.endprog = endprogs[initial] or endprogs[token[1]]
   261:                     self.contstr, self.needcont = line[start:], 1
   262:                     self.contline = line
   263:                     break
   264:                 else:                                  # ordinary string
   265:                     tokeneater(STRING, token, spos, epos, line)
   266:             elif initial in namechars:                 # ordinary name
   267:                 tokeneater(NAME, token, spos, epos, line)
   268:             elif initial == '\\':                      # continued stmt
   269:                 self.continued = 1
   270:             else:
   271:                 if initial in '([{': self.parenlev = self.parenlev + 1
   272:                 elif initial in ')]}': self.parenlev = self.parenlev - 1
   273:                 if self.squashop:
   274:                   tokeneater(OP, token, spos, epos, line)
   275:                 else:
   276:                   op = opdict[token]
   277:                   tokeneater(op, token, spos, epos, line)
   278:         else:
   279:             tokeneater(ERRORTOKEN, line[pos],
   280:                        (lnum, pos), (lnum, pos+1), line)
   281:             pos = pos + 1
   282: 
   283: 
   284:   def _close(self):
   285:       for indent in self.indents[1:]:          # pop remaining indent levels
   286:           self.tokeneater(DEDENT, '', (self.lnum, 0), (self.lnum, 0), '')
   287:       self.tokeneater(ENDMARKER, '', (self.lnum, 0), (self.lnum, 0), '')
   288: 
   289: if __name__ == '__main__':                     # testing
   290:     import sys
   291:     if len(sys.argv) > 1: tokenise(open(sys.argv[1]).readline)
   292:     else: tokenise(sys.stdin.readline)
   293: 
End python section to interscript/tokenisers/python.py[1]