# rxb, a simple regular expression builder (by Ka-Ping Yee, 20 Sept 1996)
#
# 1996-10-22: changed to backslash-parens everywhere as a workaround for the
#   regex.symcomp() bug pointed out by William S. Lear <rael@dejanews.com>
#
# 1996-11-08: bug reported by Jonathan Giddy <jon@dstc.edu.au>
#   literal parentheses no longer escaped
#
# 2000-01-26: converted for re module; added sub, split, followedby

"""rxb, a simple regular expression builder (by Ka-Ping Yee, 20 Sept 1996)

From an idea by Greg Ewing on comp.lang.python.

This module encapsulates the construction and functionality of regular
expressions in a class named 'Pattern'.  To build 'Pattern's, use the
functions and constants in this module; you should not need to instantiate
the 'Pattern' class directly unless you are actually supplying a real
(awk-style) regular expression.

You can concatenate 'Pattern' instances using the '+' operator or repeat
them using the '*' operator with a number.
 
The available functions are:

    exactly(<literal string>)               :: exactly the given string
    anybut(<literal string>)                :: text not containing the string
    member(<literal>, <literal>, ...)       :: any single char mentioned
    nonmember(<literal>, <literal>, ...)    :: any single char not mentioned

    maybe(<pattern>)                        :: zero or one occurrence
    some(<pattern>)                         :: one or more occurrences
    any(<pattern>)                          :: zero or more occurrences
    either(<pattern>, <pattern>, ...)       :: one of the alternatives
    label(<name>, <pattern>)                :: label a subgroup for later

    followedby(<pattern>)                   :: positive lookahead assertion
    notfollowedby(<pattern>)                :: negative lookahead assertion

For 'label' you can also use the alternate, more concise syntax

    label.<name>(<pattern>)

The 'followedby' and 'notfollowedby' functions indicate that you want to
look for a match after a particular point, or make sure that there is *not*
a match after a particular point, without actually consuming any of the
string being matched.

The first four functions only accept literal strings.  The rest all
accept either literals or 'Pattern's otherwise created by this module.
Note that 'exactly()' is necessary only if used alone, since any
string will be converted from a literal to a 'Pattern' by any of the
other operations (including '+').

'member()' and 'nonmember()' accept any literal characters or strings of
characters among their arguments, as well as the special constants
'letters', 'digits', 'hexdigits', 'wordchars', and 'whitespace' from
this module.  (The corresponding constants starting with 'non-' do not work
here.) You can also give to 'member()' or 'nonmember()' a sequence created
using 'chrange(<startchar>, <endchar>)'.

For your convenience, the following 'Pattern' constants are also available:

    letter, letters                 :: any small or capital letter
    digit, digits                   :: any digit
    wordchar, wordchars             :: letter, digit, or underscore
    hexdigit, hexdigits             :: any hexadecimal digit
    whitespace                      :: space, return, newline, tab
    anychar, anychars               :: any single character

    nonletter, nondigit, nonwordchar,
    nonhexdigit, or nonwhitespace   :: any char other than the indicated type

    begline, endline                :: beginning or end of line
    anything                        :: any number of non-newlines
    something                       :: one or more non-newlines
    anyspace                        :: any amount of whitespace
    somespace                       :: one or more whitespace chars

When you're done constructing, use these 'Pattern' methods to do real work:

    match(<string>[, <start>])      :: match at beginning of string or at index
    search(<string>[, <start>])     :: find anywhere in string or after index
    sub(repl, string[, <count>])    :: substitute (at most 'count' times)
    subn(repl, string[, <count>])   :: substitute and also return count of hits
    split(string[, <pieces>])       :: split (into at most given # of pieces)
    imatch(<string>[, <start>])     :: case-insensitive match
    isearch(<string>[, <start>])    :: case-insensitive search
    isub, isubn, isplit             :: case-insensitive sub, subn, and split

Each 'Pattern' will manage its own compilation.  If for some reason you
must get the compiled regular expression (compiled using Python's built-in
're' module) you can use the 'compile()' and 'icompile()' methods.

The following methods and attributes work both on the 'Match' object
returned by 'match()', 'search()', 'imatch()', or 'isearch()', and on the
'Pattern' object itself, where they refer to the last match or search attempt.

    found                           :: the entire string that matched
    before                          :: everything before what matched
    after                           :: everything after what matched
    group(<label>)                  :: the string that matched a group
    start([<label>])                :: the index where a group started
    end([<label>])                  :: the index where a group ended
    span([<label>])                 :: a group's starting and ending indices
    string                          :: the whole string we tried to match

If no <label> argument is given, 'group()', 'start()', 'end()', and
'span()' return information about the entire string that matched.
Instead of 'pat.group(<label>)' you can use the more concise syntax

    pat.<label>

as long as you don't use labels named 'match', 'search', etc.


A note about importing:

    Since you may be using lots of functions from this module together
    at once, you may be tempted to do 'from rxb import *'.  If you want
    to accomplish this, but don't want to damage your current namespace,
    try the 'welcome()' and 'banish()' functions instead.  The 'welcome()'
    function takes one optional argument, the module to "move in" to; if
    omitted, it defaults to the module from which welcome() is called.
    Here's what happens:

        >>> letter = 'hello'
        >>> def digit(x, y): return x + y
        >>> import rxb
        >>> letter
        'hello'
        >>> rxb.welcome()           # selected names are saved and rebound
        >>> letter
        <Pattern [A-Za-z]>
        >>> digit                   # now you don't have to write "rxb.digit"
        <Pattern [0-9]>

        # ... do your regex work here ...

        >>> rxb.banish()            # our names are restored
        >>> letter
        'hello'
        >>> digit(8, 9)
        17
"""

import re, string
class error(Exception):
    """Exception raised by this module.

    This used to be the string "rxb.error"; string exceptions can no longer
    be raised by Python 2.6 and later, so it is now a real exception class.
    Both 'raise error, "message"' and 'except error:' keep working."""
    pass

# ------------------------------------------------------------- Pattern class

class Pattern:
    """Class encapsulating regular expression functionality.

    This class just stores one regular-expression string (in the syntax of
    Python's 're' module, which is what compiles it), and allows you to use
    the addition operator (with other Patterns or ordinary strings) and the
    multiplication operator (with integers).  It produces and caches its own
    compiled-regex object so you can use searching methods on a 'Pattern'
    object.  (Actually, there may be two regex objects, one case-sensitive
    and one case-insensitive.)

    After 'match', 'search', 'imatch' or 'isearch' the raw 're' match (or
    None) is kept in 'lastmatch', and 'found', 'before', 'after', 'string',
    'group()', 'start()', 'end()', 'span()' and '<pattern>.<label>' all
    refer to that last attempt."""

    def __init__(self, regex):      # can init with a regular expression
        self.regex = regex
        self.prog = None            # cached case-sensitive compiled regex
        self.iprog = None           # cached case-insensitive compiled regex
        self.lastmatch = None       # result of the latest match/search

    def __add__(self, other):
        """Concatenate with another 'Pattern' or a literal string."""
        return Pattern(self.regex + makepat(other).regex)

    def __radd__(self, other):
        """Concatenate when the left operand is a literal string."""
        return Pattern(makepat(other).regex + self.regex)

    def __mul__(self, number):
        """Repeat the regular-expression text 'number' times."""
        return Pattern(self.regex * number)

    def __rmul__(self, number):
        """Same as '__mul__', for 'number * pattern'."""
        return Pattern(self.regex * number)

    def __repr__(self):
        # repr() of the string minus its surrounding quotes.
        return "<Pattern " + repr(self.regex)[1:-1] + ">"

    def compile(self):
        """Return the case-sensitive compiled regex, compiling on first use."""
        if self.prog is None:
            self.prog = re.compile(self.regex)
        return self.prog

    def icompile(self):
        """Return the case-insensitive compiled regex, compiling on first use."""
        if self.iprog is None:
            # This used to call the unimported 'regex' module and so raised
            # NameError; the 're' module is the one this file uses.
            self.iprog = re.compile(self.regex, re.IGNORECASE)
        return self.iprog

    def search(self, string, start=0):
        """Search 'string' from index 'start'; return a 'Match' or None."""
        self.lastmatch = self.compile().search(string, start)
        return self.lastmatch and Match(self.lastmatch)

    def isearch(self, string, start=0):
        """Case-insensitive 'search'."""
        self.lastmatch = self.icompile().search(string, start)
        return self.lastmatch and Match(self.lastmatch)

    def match(self, string, start=0):
        """Match 'string' at index 'start'; return a 'Match' or None."""
        self.lastmatch = self.compile().match(string, start)
        return self.lastmatch and Match(self.lastmatch)

    def imatch(self, string, start=0):
        """Case-insensitive 'match'."""
        self.lastmatch = self.icompile().match(string, start)
        return self.lastmatch and Match(self.lastmatch)

    def split(self, string, pieces=0):
        """Split 'string' into a list of at most 'pieces' pieces.

        With 'pieces' equal to 0 (the default) there is no limit."""
        if pieces == 0: return self.compile().split(string)
        # A maxsplit of 0 means "no limit" to 're', so one piece needs its
        # own case; it is returned as a list like every other result.
        if pieces == 1: return [string]
        return self.compile().split(string, pieces-1)

    def isplit(self, string, pieces=0):
        """Case-insensitive 'split'."""
        if pieces == 0: return self.icompile().split(string)
        if pieces == 1: return [string]
        return self.icompile().split(string, pieces-1)

    def sub(self, repl, string, count=0):
        """Substitute 'repl' for matches in 'string' (at most 'count' times)."""
        return self.compile().sub(repl, string, count)

    def isub(self, repl, string, count=0):
        """Case-insensitive 'sub'."""
        return self.icompile().sub(repl, string, count)

    def subn(self, repl, string, count=0):
        """Like 'sub', but return (new string, number of substitutions)."""
        return self.compile().subn(repl, string, count)

    def isubn(self, repl, string, count=0):
        """Case-insensitive 'subn'."""
        return self.icompile().subn(repl, string, count)

    def start(self, n=0):
        """Index where group 'n' (a number or a label) of the last match began."""
        return self.lastmatch.start(n)

    def end(self, n=0):
        """Index where group 'n' (a number or a label) of the last match ended."""
        return self.lastmatch.end(n)

    def span(self, n=0):
        """(start, end) indices of group 'n' (a number or a label)."""
        return self.lastmatch.span(n)

    def __getattr__(self, label):
        """Supply 'found', 'before', 'after', 'string' and '<label>' access.

        Raises AttributeError for special (double-underscore) names and when
        no successful match is available; an unknown label is reported by
        the underlying match object's 'group()'."""
        # Special names are never group labels.  Answering them here would
        # confuse hasattr(), copying and other protocol lookups.
        if label[:2] == "__" and label[-2:] == "__":
            raise AttributeError(label)
        # Reached only on an instance that was never initialised; looking
        # 'lastmatch' up again below would recurse without end.
        if label == "lastmatch":
            raise AttributeError(label)
        last = self.lastmatch
        if last is None:
            raise AttributeError("no successful match to get %r from" % (label,))
        if label == "found":
            return last.group(0)
        elif label == "before":
            return last.string[:last.start()]
        elif label == "after":
            return last.string[last.end():]
        elif label == "string":
            return last.string
        return last.group(label)

    def group(self, label=0):
        """Return the text matched by group 'label' in the last match."""
        return self.lastmatch.group(label)

class Match:
    """A more pleasant interface to re.MatchObject.

    Wraps the match object handed to the constructor (kept in 'match') and
    offers 'found', 'before', 'after', 'string', 'group()', 'start()',
    'end()', 'span()' and '<match>.<label>' access to it."""

    def __init__(self, match):
        self.match = match

    def __repr__(self):
        return "<Match " + repr(self.group(0)) + ">"

    def __len__(self):
        """Length of the text that matched."""
        return self.match.end() - self.match.start()

    def __nonzero__(self):
        # A Match always stands for a successful match, even an empty one;
        # without this, truth testing falls back on __len__ and a
        # zero-length match would look like a failure.
        return True

    __bool__ = __nonzero__

    def start(self, n=0):
        """Index where group 'n' (a number or a label) began."""
        return self.match.start(n)

    def end(self, n=0):
        """Index where group 'n' (a number or a label) ended."""
        return self.match.end(n)

    def span(self, n=0):
        """(start, end) indices of group 'n' (a number or a label)."""
        return self.match.span(n)

    def __getattr__(self, label):
        """Supply 'found', 'before', 'after', 'string' and '<label>' access.

        Raises AttributeError for special (double-underscore) names; an
        unknown label is reported by the underlying 'group()'."""
        # Special names are never group labels.  Answering them here would
        # confuse hasattr(), copying and other protocol lookups.
        if label[:2] == "__" and label[-2:] == "__":
            raise AttributeError(label)
        # Reached only on an instance that was never initialised; looking
        # 'match' up again below would recurse without end.
        if label == "match":
            raise AttributeError(label)
        match = self.match
        if label == "found":
            return match.group(0)
        elif label == "before":
            return match.string[:match.start()]
        elif label == "after":
            return match.string[match.end():]
        elif label == "string":
            return match.string
        return match.group(label)

    def group(self, label=0):
        """Return the text matched by group 'label' (default: everything)."""
        return self.match.group(label)

def makepat(object):
    """Return a plain string converted with 'exactly()'; anything else as is."""
    if type(object) != type("string"):
        return object
    return exactly(object)

# ---------------------------------------------------- backward compatibility

def compile(pattern, flags=0):
    """Compile 'pattern' with the 're' module and return the result.

    'pattern' is either a regular-expression string or an object with a
    'regex' attribute (such as a 'Pattern'); 'flags' is passed to
    're.compile()'."""
    # An explicit branch: the old "test and a or b" idiom fell through to
    # pattern.regex whenever the string was empty.
    if type(pattern) == type("string"):
        regexpr = pattern
    else:
        regexpr = pattern.regex
    return re.compile(regexpr, flags)

def icompile(pattern, flags=0):
    """Compile 'pattern' case-insensitively and return the result.

    'pattern' is either a regular-expression string or an object with a
    'regex' attribute (such as a 'Pattern'); 'flags' is combined with
    're.IGNORECASE'."""
    # An explicit branch: the old "test and a or b" idiom fell through to
    # pattern.regex whenever the string was empty.
    if type(pattern) == type("string"):
        regexpr = pattern
    else:
        regexpr = pattern.regex
    return re.compile(regexpr, flags | re.IGNORECASE)

def match(pattern, string, start=0):
    """Match 'pattern' against 'string' at index 'start'.

    Returns a 'Match' on success, otherwise the (false) result of the
    underlying match call."""
    hit = compile(pattern).match(string, start)
    if not hit:
        return hit
    return Match(hit)

def imatch(pattern, string, start=0):
    """Case-insensitively match 'pattern' against 'string' at index 'start'.

    Returns a 'Match' on success, otherwise the (false) result of the
    underlying match call."""
    hit = icompile(pattern).match(string, start)
    if not hit:
        return hit
    return Match(hit)

def search(pattern, string, start=0):
    """Search 'string' for 'pattern', starting at index 'start'.

    Returns a 'Match' on success, otherwise the (false) result of the
    underlying search call."""
    hit = compile(pattern).search(string, start)
    if not hit:
        return hit
    return Match(hit)

def isearch(pattern, string, start=0):
    """Case-insensitively search 'string' for 'pattern' from index 'start'.

    Returns a 'Match' on success, otherwise the (false) result of the
    underlying search call."""
    hit = icompile(pattern).search(string, start)
    if not hit:
        return hit
    return Match(hit)

def sub(pattern, repl, string, count=0):
    """Substitute 'repl' for matches of 'pattern' in 'string'.

    'count' is handed on to the compiled regex's 'sub()'."""
    prog = compile(pattern)
    return prog.sub(repl, string, count)

def isub(pattern, repl, string, count=0):
    """Case-insensitive version of 'sub()'."""
    prog = icompile(pattern)
    return prog.sub(repl, string, count)

def subn(pattern, repl, string, count=0):
    """Like 'sub()', but return whatever the compiled regex's 'subn()' does.

    'count' is handed on to that 'subn()' call."""
    prog = compile(pattern)
    return prog.subn(repl, string, count)

def isubn(pattern, repl, string, count=0):
    """Case-insensitive version of 'subn()'."""
    prog = icompile(pattern)
    return prog.subn(repl, string, count)

def split(pattern, string, pieces=0):
    """Split 'string' on 'pattern' into a list of at most 'pieces' pieces.

    With 'pieces' equal to 0 (the default) there is no limit.  This follows
    the same rule as 'Pattern.split()'; 'pieces' used to be handed straight
    to the regex as its maximum number of splits, which allowed one piece
    more than asked for."""
    prog = compile(pattern)
    if pieces == 0: return prog.split(string)
    # A maxsplit of 0 means "no limit" to 're', so one piece needs its own case.
    if pieces == 1: return [string]
    return prog.split(string, pieces-1)

def isplit(pattern, string, pieces=0):
    """Case-insensitively split 'string' into at most 'pieces' pieces.

    With 'pieces' equal to 0 (the default) there is no limit.  This follows
    the same rule as 'Pattern.isplit()'; 'pieces' used to be handed straight
    to the regex as its maximum number of splits, which allowed one piece
    more than asked for."""
    prog = icompile(pattern)
    if pieces == 0: return prog.split(string)
    # A maxsplit of 0 means "no limit" to 're', so one piece needs its own case.
    if pieces == 1: return [string]
    return prog.split(string, pieces-1)

# ----------------------------------------------------------------- constants

# Single-character classes.  Each plural name is bound to the very same
# object as its singular; charclass() recognises letter, digit, hexdigit,
# wordchar and whitespace by object identity.
letter = Pattern("[A-Za-z]")
letters = letter
digit = Pattern("[0-9]")
digits = digit
hexdigit = Pattern("[A-Fa-f0-9]")
hexdigits = hexdigit
wordchar = Pattern("[A-Za-z0-9_]")
wordchars = wordchar
whitespace = Pattern("[ \t\r\n\f]")

# The complements of the classes above.
nonletter = Pattern("[^A-Za-z]")
nonletters = nonletter
nondigit = Pattern("[^0-9]")
nondigits = nondigit
nonhexdigit = Pattern("[^A-Fa-f0-9]")
nonhexdigits = nonhexdigit
nonwordchar = Pattern("[^A-Za-z0-9_]")
nonwordchars = nonwordchar
nonwhitespace = Pattern("[^ \t\r\n\f]")

# Anchors.
begline = Pattern("^")
endline = Pattern("$")

# Wildcards and runs of whitespace.
anychar = Pattern(".")
anychars = anychar
anything = Pattern(".*")
something = Pattern(".+")
anyspace = Pattern("[ \t\r\n\f]*")
somespace = Pattern("[ \t\r\n\f]+")

# --------------------------------------------------------- character classes

# In a bracketed character class, only \, ], ^ (when first), and - are special.

def charclass(*chars):
    """Return the text that goes between the brackets of a character class.

    Each argument is either a string of literal characters or one of the
    constants 'letter', 'digit', 'hexdigit', 'wordchar', 'whitespace'
    (recognised by identity).  Characters with a meaning inside brackets
    (backslash, ']', '^', '-', and '[') are backslash-escaped where they
    occur, so none of them can end the class, negate it, or form a range.

    Raises TypeError for any other kind of argument.  (The string exception
    used here before cannot be raised on Python 2.6 or later, where the
    attempt itself already produced a TypeError.)"""
    result = ""
    for arg in chars:
        if type(arg) == type("string"):
            for ch in arg:
                if ch in "\\]^-[": result = result + "\\" + ch
                else: result = result + ch
        elif arg is letter: result = result + "A-Za-z"
        elif arg is digit: result = result + "0-9"
        elif arg is hexdigit: result = result + "A-Fa-f0-9"
        elif arg is wordchar: result = result + "A-Za-z0-9_"
        elif arg is whitespace: result = result + " \t\r\n\f"
        else:
            raise TypeError(
                "member() and nonmember() only accept string literals")
    return result

def member(*chars):
    """Return a 'Pattern' matching any single character mentioned.

    The arguments are whatever 'charclass()' accepts."""
    cclass = charclass(*chars)
    # A caret right after the opening bracket would negate the class, so a
    # leading caret is escaped.  A lone "^" used to be returned as the bare
    # pattern "^", which is the start-of-line anchor, not a literal caret.
    if cclass[:1] == "^": cclass = "\\" + cclass
    return Pattern("[" + cclass + "]")

def nonmember(*chars):
    """Return a 'Pattern' matching any single character not mentioned.

    The arguments are whatever 'charclass()' accepts."""
    negated = "[^" + charclass(*chars) + "]"
    return Pattern(negated)

# Plural spellings of the two functions.
members = member
nonmembers = nonmember

def chrange(start, end):
    """Return a string of every character from 'start' through 'end'.

    The two characters may be given in either order; the result always runs
    from the lower character code to the higher one, inclusive."""
    low, high = ord(start), ord(end)
    if low > high:
        low, high = high, low
    return "".join(map(chr, range(low, high + 1)))

# ------------------------------------------------------------------ escaping

def exactly(literal):
    """Return a 'Pattern' matching exactly the string 'literal'.

    Every character with a special meaning to the 're' module is
    backslash-escaped.  That includes parentheses, braces, and the vertical
    bar: they group, repeat, and alternate in 're' syntax, so leaving them
    bare changes what the pattern matches."""
    escaped = ""
    for ch in literal:
        if ch in "^$[]\\+*?.(){}|": escaped = escaped + "\\" + ch
        else: escaped = escaped + ch
    return Pattern(escaped)

exact = exactly

def anybut(literal):
    """Return a 'Pattern' for text that does not contain 'literal'.

    For each position in 'literal' one alternative is built: the characters
    before that position, followed by any single character other than the
    one at that position.  The result is zero or more repetitions of those
    alternatives."""
    prefix = ""
    options = []
    for ch in literal:
        # Inside the brackets only backslash and "]" need escaping here.
        if ch in "\\]": options.append(prefix + "[^\\" + ch + "]")
        else: options.append(prefix + "[^" + ch + "]")
        # Outside brackets every 're' special character must be escaped,
        # including parentheses, braces and the vertical bar; a bare one
        # would unbalance or change the alternatives.
        if ch in "^$[]\\+*?.(){}|": prefix = prefix + "\\" + ch
        else: prefix = prefix + ch
    return Pattern("(" + "|".join(options) + ")*")

# ------------------------------------------------------ repetition operators

# A whole expression that is one bracketed character class.  Inside the
# brackets a backslash always travels with the character after it, so an
# escaped "]" does not end the class and an escaped backslash does not
# escape what follows.
charclassprog = re.compile(r"^\[\^?\]?(\\.|[^]\\])*\]$")
# A whole expression that is one parenthesised group with no parentheses
# inside it.
parenprog = re.compile(r"^\([^()]*\)$")

def atomic(expr):
    """Return 'expr' in a form a repetition operator can be appended to.

    A single letter or digit, a backslash followed by one character, a
    single character class, and a single parenthesised group are returned
    unchanged; anything else is wrapped in parentheses."""
    if len(expr) == 1 and expr.isalnum(): return expr
    if len(expr) == 2 and expr[0] == "\\": return expr
    # 're' returns None when there is no match (the old 'regex' module
    # returned -1, which is what these tests used to compare against).
    if charclassprog.match(expr) is not None: return expr
    if parenprog.match(expr) is not None: return expr
    return "(" + expr + ")"

def maybe(expr):
    """Return a 'Pattern' for zero or one occurrence of 'expr'."""
    unit = atomic(makepat(expr).regex)
    return Pattern(unit + "?")
def some(expr):
    """Return a 'Pattern' for one or more occurrences of 'expr'."""
    unit = atomic(makepat(expr).regex)
    return Pattern(unit + "+")
def any(expr):
    """Return a 'Pattern' for zero or more occurrences of 'expr'."""
    unit = atomic(makepat(expr).regex)
    return Pattern(unit + "*")

# --------------------------------------------------------------- alternation

def either(*alternatives):
    """Return a 'Pattern' matching any one of the given alternatives.

    Each alternative may be a literal string or a 'Pattern'; the result is
    the alternatives joined with "|" inside one pair of parentheses."""
    branches = [makepat(option).regex for option in alternatives]
    return Pattern("(" + "|".join(branches) + ")")

# ----------------------------------------------------- symbolic group labels

class Label:
    """Builder for named subgroups.

    Call the instance as 'label(<name>, <pattern>)', or use the shorter
    'label.<name>(<pattern>)'."""

    def __call__(self, name, expr):
        """Return 'expr' wrapped in a group named 'name'."""
        return Pattern("(?P<" + name + ">" + makepat(expr).regex + ")")

    def __getattr__(self, name):
        """Return a function that labels its argument with 'name'.

        Raises AttributeError for special (double-underscore) names."""
        # Special names are looked up by hasattr(), copying and other
        # protocol machinery; they must be reported as absent rather than
        # answered with a labelling function.
        if name[:2] == "__" and name[-2:] == "__":
            raise AttributeError(name)
        return lambda expr, self=self, name=name: self.__call__(name, expr)

label = Label()

# ----------------------------------------------------------------- lookahead

def followedby(expr):
    """Return a 'Pattern' asserting that 'expr' matches next (lookahead)."""
    ahead = makepat(expr).regex
    return Pattern("(?=" + ahead + ")")
def notfollowedby(expr):
    """Return a 'Pattern' asserting that 'expr' does not match next."""
    ahead = makepat(expr).regex
    return Pattern("(?!" + ahead + ")")

# ---------------------- welcome() and banish() for moving into other modules

# Names that welcome() binds in the target module and banish() takes out
# again.
exports = ["chrange", "exact", "exactly", "anybut",
           "member", "members", "nonmember", "nonmembers",
           "maybe", "some", "any", "either",
           "label", "followedby", "notfollowedby",
           "letter", "letters", "nonletter", "nonletters",
           "digit", "digits", "nondigit", "nondigits",
           "hexdigit", "hexdigits", "nonhexdigit", "nonhexdigits",
           "wordchar", "wordchars", "nonwordchar", "nonwordchars",
           "anychar", "anychars", "anything", "something",
           "whitespace", "nonwhitespace", "anyspace", "somespace",
           "begline", "endline"]

# Maps a target module's name to a dictionary of the values that welcome()
# replaced there, so that banish() can put them back.
displaced = {}
import __main__, sys

def callermodule():
    """Return the module of the nearest caller outside this module.

    Walks up the call stack until it reaches a frame whose globals carry a
    '__name__' different from this module's, then looks that name up in
    'sys.modules'.  Returns the (false) module name unchanged if it is
    empty or None."""
    frame = None
    module = None

    # Raise and catch an exception to get hold of the current frame.
    # sys.exc_info() replaces the deprecated sys.exc_traceback, which
    # Python 3 no longer has.
    try:
        raise ZeroDivisionError
    except ZeroDivisionError:
        frame = sys.exc_info()[2].tb_frame

    while frame:
        module = frame.f_globals["__name__"]
        if module != __name__: break
        frame = frame.f_back

    return module and sys.modules[module]

def welcome(target = None):
    """Bind this module's exported names in the 'target' module.

    'target' may be a module, the name of a module in 'sys.modules', or
    omitted, in which case 'callermodule()' supplies it.  Any value a name
    already had in the target is saved in 'displaced' for 'banish()'.
    Raises 'error' if the target has already been moved into."""
    global displaced

    if target:
        if type(target) == type("string"): target = sys.modules[target]
    else:
        target = callermodule()

    modname = target.__name__
    if displaced.has_key(modname):
        raise error, "welcome(): already resident in " + repr(target)

    saved = {}
    displaced[modname] = saved
    namespace = target.__dict__
    ours = globals()

    for name in exports:
        # Remember what we are about to overwrite, then overwrite it.
        if namespace.has_key(name):
            saved[name] = namespace[name]
        namespace[name] = ours[name]

def banish(target = None):
    """Undo a previous 'welcome()' on the 'target' module.

    'target' may be a module, the name of a module in 'sys.modules', or
    omitted, in which case 'callermodule()' supplies it.  Each exported
    name gets back the value saved for it, or is deleted from the target
    if nothing was saved.  Raises 'error' if the target was never moved
    into."""
    global displaced

    if target:
        if type(target) == type("string"): target = sys.modules[target]
    else:
        target = callermodule()

    modname = target.__name__
    if not displaced.has_key(modname):
        raise error, "banish(): not currently resident in " + repr(target)

    namespace = target.__dict__
    saved = displaced[modname]

    for name in exports:
        if saved.has_key(name):
            namespace[name] = saved[name]
        elif namespace.has_key(name):
            del namespace[name]

    del displaced[modname]