__version__ = "Ka-Ping Yee, 26 October 1997; patched, GvR 3/30/98'The module provides tokenisation of python source code.
The module provides a class, python_tokeniser, and a function, tokenise.
The function tokenise is provided for compatibility with the original tokenize.py. It accepts up to four arguments. The first argument, readline, is required; it is a callback which fetches a line of source for tokenisation and should return the line with a trailing newline character, or an empty string to indicate end of input. The second argument, tokeneater, is a callback which is applied to the fields of each token. If omitted, it defaults to printtoken, a pretty-printing routine which writes a formatted display of each token to sys.stdout.
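For example, the following minimal sketch tokenises a small fragment and lets the default tokeneater print each token. It assumes the module has been saved under the hypothetical name tokeniser.py; the readline method of any file-like object will do as the line source.

    import StringIO
    import tokeniser                    # hypothetical module name

    source = "x = 1 + 2\nprint x\n"
    f = StringIO.StringIO(source)

    # readline is the only required argument; the default tokeneater,
    # printtoken, pretty-prints each token to sys.stdout.
    tokeniser.tokenise(f.readline)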
The class constructor and the function accept two further optional arguments. The argument squashop defaults to 0 for the class and 1 for the function; if set, operators, brackets and special punctuation are all reported with the generic token type OP, otherwise each is reported with its specific token type from token.py. The argument report_comments defaults to 0 for the constructor and 1 for the function; if set, comments are reported as COMMENT, and blank lines and newlines inside bracketed (multi-line) statements are reported as NL.
If squashop and report_comments are both zero, the result is a 'pure' token stream suitable for parsing; if both are set, the result is better suited to pretty-printing.
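As a rough illustration of the two flags (again assuming the hypothetical module name tokeniser), the same line can be tokenised both ways:

    import tokeniser                    # hypothetical module name

    line = "a = b + c  # sum\n"

    # Pure stream: operators keep their specific types (EQUAL, PLUS, ...)
    # and the comment is dropped.
    pure = tokeniser.python_tokeniser()
    for tok in pure.tokenize(line) + pure.close():
        print tokeniser.tok_name[tok[0]], repr(tok[1])

    # Pretty-printing stream: operators are squashed to OP and the
    # comment is reported as COMMENT.
    pretty = tokeniser.python_tokeniser(squashop=1, report_comments=1)
    for tok in pretty.tokenize(line) + pretty.close():
        print tokeniser.tok_name[tok[0]], repr(tok[1])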
The class provides the following methods. The method reset resets the tokeniser state. The method write accepts arbitrary text data, which need not end on a line boundary. The method writeline must be called with a single line including its trailing newline character, or with an empty string to indicate end of input. The method get_tokens fetches the tokens produced so far and clears the token queue. The method close signals end of input and returns any remaining tokens. The method tokenize accepts arbitrary text data, writes it, and returns the tokens from the queue. Tokens which span lines are reported after the line in which they terminate has been processed.
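A sketch of incremental use, with data arriving in arbitrary chunks (module name assumed hypothetical, as above):

    import tokeniser                    # hypothetical module name

    t = tokeniser.python_tokeniser()
    for chunk in ["def f(x):\n    ret", "urn x + 1\n"]:
        t.write(chunk)                  # partial lines are buffered
        for tok in t.get_tokens():      # drain whatever tokens are ready
            print tokeniser.tok_name[tok[0]], repr(tok[1])
    for tok in t.close():               # flush trailing DEDENT and ENDMARKER
        print tokeniser.tok_name[tok[0]], repr(tok[1])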
Each token is a tuple consisting of an integer token type corresponding to the Python tokens listed in the file token.py, the lexeme which the token represents, the starting and ending positions of the lexeme as (line, column) pairs, and the source text containing the lexeme. Lines are numbered from 1, columns from 0.
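For example, a token produced by the class can be unpacked as follows (module name assumed hypothetical, as above):

    import tokeniser                    # hypothetical module name

    t = tokeniser.python_tokeniser()
    tokens = t.tokenize("x = 42\n") + t.close()
    for type, lexeme, (srow, scol), (erow, ecol), source in tokens:
        print tokeniser.tok_name[type], repr(lexeme), (srow, scol), (erow, ecol)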
1: #line 49 "iscrtkpy.ipk" 2: __version__ = "Ka-Ping Yee 1997/10/26; GvR 1998/3/20, Skaller 1998/7/24" 3: 4: import string, re 5: from token import * 6: 7: COMMENT = N_TOKENS 8: tok_name[COMMENT] = 'COMMENT' 9: NL = N_TOKENS + 1 10: tok_name[NL] = 'NL' 11: WHITESPACE = N_TOKENS+2 12: tok_name[WHITESPACE] = 'WHITESPACE' 13: 14: 15: # Changes from 1.3: 16: # Ignore now accepts \f as whitespace. Operator now includes '**'. 17: # Ignore and Special now accept \n or \r\n at the end of a line. 18: # Imagnumber is new. Expfloat is corrected to reject '0e4'. 19: # Note: to quote a backslash in a regex, it must be doubled in a r'aw' string. 20: 21: def group(*choices): return '(' + string.join(choices, '|') + ')' 22: def any(*choices): return apply(group, choices) + '*' 23: def maybe(*choices): return apply(group, choices) + '?' 24: 25: Whitespace = r'[ \f\t]*' 26: Comment = r'#[^\r\n]*' 27: Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 28: Name = r'[a-zA-Z_]\w*' 29: 30: Hexnumber = r'0[xX][\da-fA-F]*[lL]?' 31: Octnumber = r'0[0-7]*[lL]?' 32: Decnumber = r'[1-9]\d*[lL]?' 33: Intnumber = group(Hexnumber, Octnumber, Decnumber) 34: Exponent = r'[eE][-+]?\d+' 35: Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) 36: Expfloat = r'[1-9]\d*' + Exponent 37: Floatnumber = group(Pointfloat, Expfloat) 38: Imagnumber = group(r'0[jJ]', r'[1-9]\d*[jJ]', Floatnumber + r'[jJ]') 39: Number = group(Imagnumber, Floatnumber, Intnumber) 40: 41: Single = any(r"[^'\\]", r'\\.') + "'" 42: Double = any(r'[^"\\]', r'\\.') + '"' 43: Single3 = any(r"[^'\\]",r'\\.',r"'[^'\\]",r"'\\.",r"''[^'\\]",r"''\\.") + "'''" 44: Double3 = any(r'[^"\\]',r'\\.',r'"[^"\\]',r'"\\.',r'""[^"\\]',r'""\\.') + '"""' 45: Triple = group("[rR]?'''", '[rR]?"""') 46: String = group("[rR]?'" + any(r"[^\n'\\]", r'\\.') + "'", 47: '[rR]?"' + any(r'[^\n"\\]', r'\\.') + '"') 48: 49: Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '\|', 50: '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>') 51: Bracket = '[][(){}]' 52: Special = group(r'\r?\n', r'[:;.,`]') 53: Funny = group(Operator, Bracket, Special) 54: 55: PlainToken = group(Number, Funny, String, Name) 56: Token = Ignore + PlainToken 57: 58: ContStr = group("[rR]?'" + any(r'\\.', r"[^\n'\\]") + group("'", r'\\\r?\n'), 59: '[rR]?"' + any(r'\\.', r'[^\n"\\]') + group('"', r'\\\r?\n')) 60: PseudoExtras = group(r'\\\r?\n', Comment, Triple) 61: PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) 62: 63: tokenprog, pseudoprog, single3prog, double3prog = map( 64: re.compile, (Token, PseudoToken, Single3, Double3)) 65: endprogs = {"'": re.compile(Single), '"': re.compile(Double), 66: "'''": single3prog, '"""': double3prog, 67: "r'''": single3prog, 'r"""': double3prog, 68: "R'''": single3prog, 'R"""': double3prog, 'r': None, 'R': None} 69: 70: opdict = { 71: '(':LPAR, 72: ')':RPAR, 73: '[':LSQB, 74: ']':RSQB, 75: ':':COLON, 76: ',':COMMA, 77: ';':SEMI, 78: '+':PLUS, 79: '-':MINUS, 80: '*':STAR, 81: '/':SLASH, 82: '|':VBAR, 83: '&':AMPER, 84: '<':LESS, 85: '>':GREATER, 86: '=':EQUAL, 87: '.':DOT, 88: '%':PERCENT, 89: '`':BACKQUOTE, 90: '{':LBRACE, 91: '}':RBRACE, 92: '==':EQEQUAL, 93: '!=':NOTEQUAL, 94: '<>':NOTEQUAL, 95: '<=':LESSEQUAL, 96: '>=':GREATEREQUAL, 97: '~':TILDE, 98: '^':CIRCUMFLEX, 99: '<<':LEFTSHIFT, 100: '>>':RIGHTSHIFT, 101: '**':DOUBLESTAR 102: } 103: 104: tabsize = 8 105: TokenError = 'TokenError' 106: def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing 107: print "%d,%d-%d,%d:\t%s\t%s" % 
\ 108: (srow, scol, erow, ecol, tok_name[type], repr(token)) 109: 110: def tokenise(readline, tokeneater=printtoken,squashop=1, report_comments=1): 111: t = python_tokeniser(squashop, report_comments) 112: line = readline() 113: while line: 114: t.writeline(line) 115: for token in t.tokens: 116: apply(tokeneater,token) 117: t.tokens = [] 118: line = readline() 119: t.writeline('') 120: for token in t.tokens: 121: apply(tokeneater,token) 122: t.tokens = [] 123: 124: namechars, numchars = string.letters + '_', string.digits 125: 126: class python_tokeniser: 127: def __init__(self, squashop=0, report_comments=0): 128: self.squashop = squashop 129: self.report_comments = report_comments 130: self.reset() 131: 132: def reset(self): 133: self.lnum = self.parenlev = self.continued = 0 134: self.contstr, self.needcont = '', 0 135: self.contline = None 136: self.indents = [0] 137: self.tokens = [] 138: self.buffer = '' 139: 140: def get_tokens(self): 141: tmp = self.tokens 142: self.tokens = [] 143: return tmp 144: 145: def tokenize(self,data): 146: self.write(data) 147: return self.get_tokens() 148: 149: def tokeneater(self,*args): 150: self.tokens.append(args) 151: 152: def close(self): 153: if self.buffer: 154: self.writeline(self.buffer) 155: self.buffer = '' 156: self.writeline('') 157: return self.get_tokens() 158: 159: def write(self,data): 160: lines = string.split(data,'\n') 161: if lines: 162: lines[0]=lines[0]+self.buffer 163: self.buffer = '' 164: for line in lines[:-1]: 165: self.writeline(line+'\n') 166: self.buffer = lines[-1] 167: 168: def writeline(self,line): 169: lnum = self.lnum = self.lnum + 1 170: pos, max = 0, len(line) 171: tokeneater = self.tokeneater 172: 173: if self.contstr: # continued string 174: if not line: 175: raise TokenError, ("EOF in multi-line string", self.strstart) 176: endmatch = self.endprog.match(line) 177: if endmatch: 178: pos = end = endmatch.end(0) 179: tokeneater(STRING, self.contstr + line[:end], 180: self.strstart, (lnum, end), self.contline + line) 181: self.contstr, self.needcont = '', 0 182: self.contline = None 183: elif self.needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': 184: tokeneater(ERRORTOKEN, self.contstr + line, 185: self.strstart, (lnum, len(line)), self.contline) 186: self.contstr = '' 187: self.contline = None 188: return 189: else: 190: self.contstr = self.contstr + line 191: self.contline = self.contline + line 192: return 193: 194: elif self.parenlev == 0 and not self.continued: # new statement 195: if not line: self._close(); return 196: 197: column = 0 198: while pos < max: # measure leading whitespace 199: if line[pos] == ' ': column = column + 1 200: elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize 201: elif line[pos] == '\f': column = 0 202: else: break 203: pos = pos + 1 204: if pos == max: self._close(); return # omitted newline 205: 206: if line[pos] in '#\r\n': # skip comments or blank lines 207: if self.report_comments: 208: tokeneater((NL, COMMENT)[line[pos] == '#'], line[pos:], 209: (lnum, pos), (lnum, len(line)), line) 210: return 211: 212: if column > self.indents[-1]: # count indents or dedents 213: self.indents.append(column) 214: tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) 215: while column < self.indents[-1]: 216: self.indents = self.indents[:-1] 217: tokeneater(DEDENT, '', (lnum, pos), (lnum, pos), line) 218: 219: else: # continued statement 220: if not line: 221: raise TokenError, ("EOF in multi-line statement", (lnum, 0)) 222: self.continued = 0 223: 224: while pos < 
max: 225: pseudomatch = pseudoprog.match(line, pos) 226: if pseudomatch: # scan for tokens 227: start, end = pseudomatch.span(1) 228: spos, epos, pos = (lnum, start), (lnum, end), end 229: token, initial = line[start:end], line[start] 230: 231: if initial in numchars \ 232: or (initial == '.' and token != '.'): # ordinary number 233: tokeneater(NUMBER, token, spos, epos, line) 234: elif initial in '\r\n': 235: if self.parenlev == 0: 236: tokeneater(NEWLINE, token, spos, epos, line) 237: elif self.report_comments: 238: tokeneater(NL, token, spos, epos, line) 239: 240: elif initial == '#': 241: if self.report_comments: 242: tokeneater(COMMENT, token, spos, epos, line) 243: elif token in ("'''", '"""', # triple-quoted 244: "r'''", 'r"""', "R'''", 'R"""'): 245: self.endprog = endprogs[token] 246: endmatch = self.endprog.match(line, pos) 247: if endmatch: # all on one line 248: pos = endmatch.end(0) 249: token = line[start:pos] 250: tokeneater(STRING, token, spos, (lnum, pos), line) 251: else: 252: self.strstart = (lnum, start) # multiple lines 253: self.contstr = line[start:] 254: self.contline = line 255: break 256: elif initial in ("'", '"') or \ 257: token[:2] in ("r'", 'r"', "R'", 'R"'): 258: if token[-1] == '\n': # continued string 259: self.strstart = (lnum, start) 260: self.endprog = endprogs[initial] or endprogs[token[1]] 261: self.contstr, self.needcont = line[start:], 1 262: self.contline = line 263: break 264: else: # ordinary string 265: tokeneater(STRING, token, spos, epos, line) 266: elif initial in namechars: # ordinary name 267: tokeneater(NAME, token, spos, epos, line) 268: elif initial == '\\': # continued stmt 269: self.continued = 1 270: else: 271: if initial in '([{': self.parenlev = self.parenlev + 1 272: elif initial in ')]}': self.parenlev = self.parenlev - 1 273: if self.squashop: 274: tokeneater(OP, token, spos, epos, line) 275: else: 276: op = opdict[token] 277: tokeneater(op, token, spos, epos, line) 278: else: 279: tokeneater(ERRORTOKEN, line[pos], 280: (lnum, pos), (lnum, pos+1), line) 281: pos = pos + 1 282: 283: 284: def _close(self): 285: for indent in self.indents[1:]: # pop remaining indent levels 286: self.tokeneater(DEDENT, '', (self.lnum, 0), (self.lnum, 0), '') 287: self.tokeneater(ENDMARKER, '', (self.lnum, 0), (self.lnum, 0), '') 288: 289: if __name__ == '__main__': # testing 290: import sys 291: if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) 292: else: tokenize(sys.stdin.readline) 293: