1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
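# For example, tokenizing the line "x = 1\n" yields 5-tuples like
# (NAME, 'x', (1, 0), (1, 1), 'x = 1\n'), followed by OP, NUMBER,
# NEWLINE and ENDMARKER tuples.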
from __future__ import generators

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "NL"]

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return apply(group, choices) + '*'
def maybe(*choices): return apply(group, choices) + '?'
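# For example, group('a', 'b') produces '(a|b)', any('a') produces '(a)*',
# and maybe('a') produces '(a)?'; every pattern below is composed this way.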
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
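# Examples: Intnumber matches '0x1fL' and '237', Floatnumber matches
# '3.14e-2' and '.5', and Imagnumber matches '1j' and '2.5J'.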
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
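# PseudoToken drives the scanner below: the leading whitespace is matched
# outside of group 1, so pseudomatch.span(1) brackets exactly one token (or
# the opening line of a continued string).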
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

tabsize = 8
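# endprogs maps an opening quote (with any u/r prefix) to the pattern that
# matches the rest of the string; the None entries let single prefix letters
# be indexed harmlessly in the or-chain inside generate_tokens.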
class TokenError(Exception): pass

class StopTokenizing(Exception): pass
def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
def tokenize(readline, tokeneater=printtoken):
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        apply(tokeneater, token_info)
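# Usage sketch (illustrative, not part of the original module): gather
# tokens through the callback interface with a custom tokeneater.
#
#     from StringIO import StringIO
#     def eater(type, token, start, end, line):
#         print tok_name[type], repr(token)
#     tokenize(StringIO("x = 1\n").readline, eater)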
def generate_tokens(readline):
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)
        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue
        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0
        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"'):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1
    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
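# Usage sketch (illustrative, not part of the original module): the
# generator interface is driven the same way, one readline at a time.
#
#     from StringIO import StringIO
#     source = StringIO("if x:\n    y = 2\n")
#     for type, token, start, end, line in generate_tokens(source.readline):
#         print tok_name[type], repr(token), start, end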
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)