"""rxb, a simple regular expression builder (by Ka-Ping Yee, 20 Sept 1996)

From an idea by Greg Ewing, on comp.lang.python.

This module encapsulates the construction and functionality of regular
expressions in a class named 'Pattern'.  To build 'Pattern's, use the
functions and constants in this module; you should not need to instantiate
the 'Pattern' class directly unless you are actually supplying a real
(awk-style) regular expression.

You can concatenate 'Pattern' instances using the '+' operator or repeat
them using the '*' operator with a number.
 
The available functions are:

    exactly(<literal string>)               :: exactly the string!
    anybut(<literal string>)                :: any text but the string
    member(<literal>, <literal>, ...)       :: any single char mentioned
    nonmember(<literal>, <literal>, ...)    :: any single char not mentioned

    maybe(<pattern>)                        :: possibly one occurrence
    some(<pattern>)                         :: one or more occurrences
    any(<pattern>)                          :: zero or more occurrences
    either(<pattern>, <pattern>, ...)       :: one of the alternatives
    label(<name>, <pattern>)                :: label a subgroup for later

For 'label' you can also use the alternate, more concise syntax

    label.<name>(<pattern>)

The first four functions only accept literal strings.  The rest all
accept either literals or 'Pattern's otherwise created by this module.
Note that 'exactly()' is necessary only if used alone, since any
string will be converted from a literal to a 'Pattern' by any of the
other operations (including '+').

'member()' and 'nonmember()' accept any literal characters or strings of
characters among their arguments, as well as the special constants
'letters', 'digits', 'hexdigits', 'wordchars', and 'whitespace' from
this module.  It doesn't make sense to use 'non'-anything in a set of
chars, so don't. :)  You can also give to 'member()' or 'nonmember()' a
sequence created using 'chrange(<startchar>, <endchar>)'.

For your convenience, the following 'Pattern' constants are also available:

    letter, letters                 :: any small or capital letter
    digit, digits                   :: any digit
    wordchar, wordchars             :: letter, digit, or underscore
    hexdigit, hexdigits             :: any hexadecimal digit
    whitespace                      :: space, tab, return, newline, formfeed
    anychar, anychars               :: any single character

    nonletter, nondigit, nonwordchar,
    nonhexdigit, or nonwhitespace   :: any char other than the indicated type

    begline, endline                :: beginning or end of line
    anything                        :: any number of non-newlines
    something                       :: one or more non-newlines
    anyspace                        :: any amount of whitespace
    somespace                       :: one or more whitespace chars

When you're done, you should use these 'Pattern' methods to do the searching:

    match(<string>)                 :: match at beginning of string
    search(<string>)                :: find anywhere in string
    imatch(<string>)                :: case-insensitive match
    isearch(<string>)               :: case-insensitive search

    group(<label>)                  :: group after a successful match
    found                           :: the piece of string that matched
    before                          :: everything before what matched
    after                           :: everything after what matched

Each 'Pattern' will manage its own compilation.  If for some reason you
must get the compiled regular expression (compiled using Python's built-in
'regex' module) you can use the 'compile()' and 'icompile()' methods.

Instead of 'pat.group(<label>)' you can use the more concise syntax

    pat.<label>

as long as you don't use labels named 'match', 'search', etc.


A note about importing:

    Since you may be using lots of functions from this module together
    at once, you may be tempted to do 'from rxb import *'.  If you want
    to accomplish this, but don't want to damage your current namespace,
    try the 'welcome()' and 'banish()' functions instead.  The 'welcome()'
    function takes one argument, the module to "move in" to; if omitted,
    it defaults to the module that called it.  Here's what happens::

        >>> letter = 'hello'
        >>> def digit(x, y): return x + y
        >>> import rxb
        >>> letter
        'hello'
        >>> rxb.welcome()           # selected names are saved and rebound
        >>> letter
        <Pattern [A-Za-z]>
        >>> digit                   # now you don't have to write "rxb.digit"
        <Pattern [0-9]>

        # ... do your regex work here ...

        >>> rxb.banish()            # our names are restored
        >>> letter
        'hello'
        >>> digit(8, 9)
        17
"""

# 96/10/22: changed to backslash-parens everywhere as a workaround for the
#   regex.symcomp() bug pointed out by William S. Lear <rael@dejanews.com>

# 96/11/08: bug reported by Jonathan Giddy <jon@dstc.edu.au>
#   literal parentheses no longer escaped

import regex, string

from regex_syntax import RE_SYNTAX_AWK, RE_NO_BK_PARENS
# Syntax installed (via regex.set_syntax) around every compile done by this
# module: RE_SYNTAX_AWK with the RE_NO_BK_PARENS bit cleared.  The patterns
# generated below write groups as backslash-parens accordingly.
RXB_SYNTAX = RE_SYNTAX_AWK & (~RE_NO_BK_PARENS)
# String exception raised by this module (see the 'raise error, ...' lines).
error = 'rxb.error'

# ------------------------------------------------------------- Pattern class

class Pattern:
    """Class encapsulating regular expression functionality.

    This class just stores one regular-expression string (in awk syntax),
    and allows you to use the addition operator (with other Patterns or
    ordinary strings) and the multiplication operator (with integers).
    It produces and caches its own compiled-regex object so you can use
    searching methods on a 'Pattern' object.  (Actually, there may be two
    regex objects, one case-sensitive and one case-insensitive.)

    Instance attributes:

        regex       the regular-expression string
        prog        cached case-sensitive compiled object, or None
        iprog       cached case-insensitive compiled object, or None
        lastprog    compiled object used by the most recent search/match

    Any other attribute name is treated as a group label and looked up on
    'lastprog'; the names 'found', 'before' and 'after' are special."""

    def __init__(self, regex):      # can init with a regular expression
        self.regex = regex
        self.prog = None
        self.iprog = None
        self.lastprog = None

    def __add__(self, other):
        """Concatenate with another Pattern or a literal string."""
        return Pattern(self.regex + makepat(other).regex)

    def __radd__(self, other):
        """Concatenate when this Pattern is the right-hand operand."""
        return Pattern(makepat(other).regex + self.regex)

    def __mul__(self, number):
        """Repeat the regular-expression string 'number' times."""
        return Pattern(self.regex * number)

    def __rmul__(self, number):
        """Same as __mul__, for 'number * pattern'."""
        return Pattern(self.regex * number)

    def __repr__(self):
        # repr() of the string supplies the escaping; strip its quotes.
        escaped = repr(self.regex)
        return '<Pattern ' + escaped[1:-1] + '>'

    def compile(self):
        """Return the case-sensitive compiled object, compiling on first use."""
        if not self.prog:
            oldsyntax = regex.set_syntax(RXB_SYNTAX)
            try:
                self.prog = regex.symcomp(self.regex)
            finally:
                # The syntax is global state of the regex module: put it
                # back even when the expression fails to compile.
                regex.set_syntax(oldsyntax)
        return self.prog

    def icompile(self):
        """Return the compiled object built with regex.casefold, compiling
        on first use."""
        if not self.iprog:
            oldsyntax = regex.set_syntax(RXB_SYNTAX)
            try:
                self.iprog = regex.symcomp(self.regex, regex.casefold)
            finally:
                regex.set_syntax(oldsyntax)
        return self.iprog

    def search(self, string):
        """Run search() of the case-sensitive compiled object on 'string'
        and return its result."""
        self.lastprog = self.compile()
        return self.prog.search(string)

    def isearch(self, string):
        """Like search(), using the case-insensitive compiled object."""
        self.lastprog = self.icompile()
        return self.iprog.search(string)

    def match(self, string):
        """Run match() of the case-sensitive compiled object on 'string'
        and return its result."""
        self.lastprog = self.compile()
        return self.prog.match(string)

    def imatch(self, string):
        """Like match(), using the case-insensitive compiled object."""
        self.lastprog = self.icompile()
        return self.iprog.match(string)

    def __getattr__(self, label):
        """Resolve 'found', 'before', 'after' or a group label against the
        compiled object used by the last search or match.

        Raises AttributeError for protocol probes ('__xxx__' names) and
        when no search or match has been performed yet."""
        # Names such as __setstate__ or __deepcopy__ are looked up by
        # copy/pickle and friends; they are never group labels.
        if label[:2] == '__' and label[-2:] == '__':
            raise AttributeError(label)
        # Read the instance dictionary directly: 'self.lastprog' would
        # re-enter __getattr__ without end on an instance whose __init__
        # has not run (e.g. one being rebuilt by copy.copy).
        try:
            prog = self.__dict__['lastprog']
        except KeyError:
            prog = None
        if prog is None:
            raise AttributeError(label)
        if label == 'found':
            return prog.group(0)
        elif label == 'before':
            return prog.last[:prog.regs[0][0]]
        elif label == 'after':
            return prog.last[prog.regs[0][1]:]
        return prog.group(label)

    def group(self, label):
        """Return group 'label' from the last search or match."""
        return self.lastprog.group(label)

def makepat(object):
    """Coerce 'object' to a Pattern.

    A plain string is treated as literal text and converted with
    exactly(); anything else is handed back untouched."""
    if type(object) != type('string'):
        return object
    return exactly(object)

# ---------------------------------------------------- backward compatibility

def withawk(function, *args):
    """Call 'function' with 'args' while RXB_SYNTAX is the active regex
    syntax, and return its result.

    The previously active syntax is reinstated afterwards, including when
    'function' raises (for example on a malformed expression), so a failed
    call cannot leave the regex module's global syntax changed."""
    oldsyntax = regex.set_syntax(RXB_SYNTAX)
    try:
        return apply(function, args)
    finally:
        regex.set_syntax(oldsyntax)

def search(pattern, string):
    """Run regex.search under RXB_SYNTAX and return its result.

    'pattern' is either a regular-expression string or a Pattern."""
    # An explicit branch rather than 'cond and a or b': that idiom falls
    # through to pattern.regex when 'pattern' is the empty string.
    if type(pattern) == type('string'):
        regexpr = pattern
    else:
        regexpr = pattern.regex
    return withawk(regex.search, regexpr, string)

def match(pattern, string):
    """Run regex.match under RXB_SYNTAX and return its result.

    'pattern' is either a regular-expression string or a Pattern."""
    # An explicit branch rather than 'cond and a or b': that idiom falls
    # through to pattern.regex when 'pattern' is the empty string.
    if type(pattern) == type('string'):
        regexpr = pattern
    else:
        regexpr = pattern.regex
    return withawk(regex.match, regexpr, string)

def compile(pattern, xlate = None):
    """Compile 'pattern' with regex.symcomp under RXB_SYNTAX.

    'pattern' is either a regular-expression string or a Pattern.  'xlate',
    when true, is passed to regex.symcomp as its second argument; a false
    value is omitted from the call."""
    # An explicit branch rather than 'cond and a or b': that idiom falls
    # through to pattern.regex when 'pattern' is the empty string.
    if type(pattern) == type('string'):
        regexpr = pattern
    else:
        regexpr = pattern.regex
    if xlate: return withawk(regex.symcomp, regexpr, xlate)
    else: return withawk(regex.symcomp, regexpr)

def searchicase(pattern, string):
    """Compile 'pattern' with regex.casefold and search 'string' with it."""
    return compile(pattern, regex.casefold).search(string)

def matchicase(pattern, string):
    """Compile 'pattern' with regex.casefold and match 'string' with it."""
    return compile(pattern, regex.casefold).match(string)

# ----------------------------------------------------------------- constants

# Single-character classes.  Each plural name is bound to the very same
# Pattern object as its singular form.  charclass() recognises 'letter',
# 'digit', 'hexdigit', 'wordchar' and 'whitespace' by object identity.
letter = letters =              Pattern('[A-Za-z]')
digit = digits =                Pattern('[0-9]')
hexdigit = hexdigits =          Pattern('[A-Fa-f0-9]')
wordchar = wordchars =          Pattern('[A-Za-z0-9_]')
whitespace =                    Pattern('[ \t\r\n\f]')

# Negated counterparts of the classes above.
nonletter = nonletters =        Pattern('[^A-Za-z]')
nondigit = nondigits =          Pattern('[^0-9]')
nonhexdigit = nonhexdigits =    Pattern('[^A-Fa-f0-9]')
nonwordchar = nonwordchars =    Pattern('[^A-Za-z0-9_]')
nonwhitespace =                 Pattern('[^ \t\r\n\f]')

# Line anchors.
begline =                       Pattern('^')
endline =                       Pattern('$')

# Wildcards and whitespace runs ('*' forms may match nothing, '+' forms
# need at least one character).
anychar = anychars =            Pattern('.')
anything =                      Pattern('.*')
something =                     Pattern('.+')
anyspace =                      Pattern('[ \t\r\n\f]*')
somespace =                     Pattern('[ \t\r\n\f]+')

# --------------------------------------------------------- character classes

# In a bracketed character class, only ], - and a leading ^ are special.
# ALL other characters are literal, including backslash!

def charclass(*chars):
    """Build the inside of a bracketed character class (no brackets).

    Each argument is either one of the module constants 'letter', 'digit',
    'hexdigit', 'wordchar' or 'whitespace' (recognised by identity), or a
    string whose characters are all added to the class.  Anything else
    raises 'error'.

    The characters ']', '-' and '^' are positioned so that they are taken
    literally: ']' first, '^' anywhere but first, '-' first or last."""
    rightbracket = 0
    hyphen = 0
    caret = 0
    result = ''
    for arg in chars:
        if arg is letter: result = result + 'A-Za-z'
        elif arg is digit: result = result + '0-9'
        elif arg is hexdigit: result = result + 'A-Fa-f0-9'
        elif arg is wordchar: result = result + 'A-Za-z0-9_'
        elif arg is whitespace: result = result + ' \t\r\n\f'
        elif type(arg) != type('string'): raise error, \
            "member() and nonmember() only accept string literals"
        else:
            for ch in arg:
                if ch == '-': hyphen = 1
                elif ch == ']': rightbracket = 1
                elif ch == '^': caret = 1
                else: result = result + ch
    if rightbracket: result = ']' + result      # ] allowed only at beginning
    if caret and hyphen and not result:
        # Nothing else to hide behind: a leading hyphen is literal, and the
        # caret then sits safely in second place.
        return '-^'
    if caret: result = result + '^'             # ^ allowed not at beginning
    # The hyphen must come last; placed before the caret (as in 'a-^') it
    # would be read as a range.
    if hyphen: result = result + '-'            # - allowed at end of class
    return result

def member(*chars):
    """Return a Pattern matching any single character among 'chars'.

    Arguments are as for charclass().  When the only character requested
    is a caret, a bracketed class cannot express it ('[^]' doesn't make
    sense), so an escaped literal caret is returned instead."""
    cclass = apply(charclass, chars)
    if cclass == '^':
        # A bare '^' would be the beginning-of-line anchor (cf. 'begline'),
        # not a caret character; escape it the way exactly() does.
        return Pattern('\\^')
    return Pattern('[' + cclass + ']')

def nonmember(*chars):
    """Return a Pattern matching any single character NOT among 'chars'.

    Arguments are as for charclass()."""
    body = apply(charclass, chars)
    return Pattern('[^' + body + ']')

# Plural-named aliases for member() and nonmember().
members, nonmembers = member, nonmember

def chrange(start, end):
    """Return a string of every character from 'start' through 'end'.

    Both ends are included, and the two arguments may be given in either
    order; the result always runs from the lower character code upward."""
    first, last = ord(start), ord(end)
    low = min(first, last)
    high = max(first, last)
    chars = ''
    code = low
    while code <= high:
        chars = chars + chr(code)
        code = code + 1
    return chars

# ------------------------------------------------------------------ escaping

def exactly(literal):
    """Return a Pattern that matches the string 'literal' and nothing else.

    Characters that act as operators are preceded by a backslash.  That
    includes '|', which is the alternation operator in the syntax used by
    this module (either() joins its options with it).  Parentheses are
    left alone because grouping is written with backslash-parens here."""
    escaped = ''
    for ch in literal:
        if ch == '<': escaped = escaped + '[<]' # hide < from symcomp
        elif ch in '^$[]\\+*?.|': escaped = escaped + '\\' + ch
        else: escaped = escaped + ch
    return Pattern(escaped)

# Shorter alias for exactly().
exact = exactly

def anybut(literal):
    """Return a Pattern for any text but the string 'literal'.

    For each position i in 'literal' one alternative is built: the first
    i characters of 'literal' followed by any character other than
    literal[i].  The alternatives are joined into a starred group."""
    prefix = ''
    options = []
    for ch in literal:
        options.append(prefix + '[^' + ch + ']')
        # The prefix is matched outside a character class, so operator
        # characters in it must be escaped to stand for themselves.
        prefix = prefix + exactly(ch).regex
    return Pattern('\\(' + string.join(options, '|') + '\\)*')

# ------------------------------------------------------ repetition operators

# Helper regexes, compiled once at import time.  They are compiled under
# plain RE_SYNTAX_AWK (not RXB_SYNTAX); the syntax that was active before
# is reinstated on the last line.
oldsyntax = regex.set_syntax(RE_SYNTAX_AWK)
# Anchored at both ends: '[', an optional '^', an optional ']', any run of
# characters other than ']', then a closing ']'.
charclassprog = regex.compile('^\[\^?\]?[^]]*\]$')
# Anchored at both ends: text wrapped in one pair of parentheses with no
# parenthesis inside.
# NOTE(review): under RE_SYNTAX_AWK the backslashed parens here presumably
# match literal '(' and ')' rather than a backslash-paren group -- confirm.
parenprog = regex.compile('^\([^()]*\)$')
regex.set_syntax(oldsyntax)

def atomic(expr):
    """Return 'expr' in a form a repetition operator can be appended to.

    A single letter or digit, a backslash followed by one character, and
    a lone bracketed character class are returned unchanged.  Everything
    else is wrapped in a backslash-paren group."""
    if len(expr) == 1 and expr in string.digits + string.letters: return expr
    if len(expr) == 2 and expr[0] == '\\': return expr
    if charclassprog.match(expr) > -1: return expr
    # Text of the form '(...)' is deliberately NOT treated as atomic: plain
    # parentheses are literal characters in the syntax used by this module
    # (exactly() leaves them unescaped), so a suffix appended to '(x)'
    # would bind to the ')' alone.
    return '\\(' + expr + '\\)'

def maybe(expr):
    """Return a Pattern for possibly one occurrence of 'expr'."""
    unit = atomic(makepat(expr).regex)
    return Pattern(unit + '?')
def some(expr):
    """Return a Pattern for one or more occurrences of 'expr'."""
    unit = atomic(makepat(expr).regex)
    return Pattern(unit + '+')
def any(expr):
    """Return a Pattern for zero or more occurrences of 'expr'."""
    unit = atomic(makepat(expr).regex)
    return Pattern(unit + '*')

# --------------------------------------------------------------- alternation

def either(*alternatives):
    """Return a Pattern matching one of the given alternatives.

    Each alternative may be a Pattern or a literal string.  The result is
    a single backslash-paren group with the options separated by '|'."""
    regexes = map(lambda choice: makepat(choice).regex, alternatives)
    return Pattern('\\(' + string.join(regexes, '|') + '\\)')

# ----------------------------------------------------- symbolic group labels

class Label:
    """Factory for labelled subgroups.

    Calling an instance as inst(name, expr) returns a Pattern wrapping
    'expr' in a group tagged '<name>'.  Attribute access is a shorthand:
    inst.name(expr) is the same as inst(name, expr)."""

    def __call__(self, name, expr):
        return Pattern('\\(<' + name + '>' + makepat(expr).regex + '\\)')

    def __getattr__(self, name):
        # Protocol probes such as __repr__ or __deepcopy__ are not label
        # names.  Answering them with the one-argument function below
        # makes repr(), copying and similar operations fail when it is
        # called with the wrong arguments.
        if name[:2] == '__' and name[-2:] == '__':
            raise AttributeError(name)
        return lambda expr, self=self, name=name: self.__call__(name, expr)

# The Label instance published as 'label'; use it as label(name, pattern)
# or label.name(pattern).
label = Label()

# ---------------------- welcome() and banish() for moving into other modules

# Names that welcome() copies from this module into a target module and
# that banish() later removes or restores.
exports = ['chrange', 'exact', 'exactly', 'anybut',
    'member', 'members', 'nonmember', 'nonmembers',
    'maybe', 'some', 'any', 'either', 'label',
    'letter', 'letters', 'nonletter', 'nonletters',
    'digit', 'digits', 'nondigit', 'nondigits',
    'hexdigit', 'hexdigits', 'nonhexdigit', 'nonhexdigits',
    'wordchar', 'wordchars', 'nonwordchar', 'nonwordchars',
    'anychar', 'anychars', 'anything', 'something',
    'whitespace', 'nonwhitespace', 'anyspace', 'somespace',
    'begline', 'endline',
    'search', 'searchicase', 'match', 'matchicase', 'compile']

# Maps the __name__ of each module welcome() has moved into to a dictionary
# holding the bindings it overwrote there, keyed by name.
displaced = {}
import __main__, sys

def callermodule():
    """Return the module of the nearest caller outside this module.

    The call stack is walked outward from this function until a frame is
    found whose globals carry a '__name__' different from this module's;
    that name is then looked up in sys.modules.  If every frame belongs
    to this module, the last name seen (this module's own) is used."""
    # Raise and catch an exception to get hold of the current frame.
    # sys.exc_info() is used rather than the global sys.exc_traceback,
    # and only the exception raised on purpose is caught.
    try:
        1/0
    except ZeroDivisionError:
        frame = sys.exc_info()[2].tb_frame

    module = None
    while frame:
        module = frame.f_globals['__name__']
        if module != __name__: break
        frame = frame.f_back

    return module and sys.modules[module]

def welcome(target = None):
    """Bind every name in 'exports' inside the module 'target'.

    'target' may be a module, the name of a module in sys.modules, or
    omitted, in which case callermodule() chooses it.  Bindings already
    present under those names are remembered in 'displaced' so banish()
    can bring them back.  Raises 'error' if this module has already moved
    into 'target'."""
    global displaced

    if not target:
        target = callermodule()
    elif type(target) == type('string'):
        target = sys.modules[target]

    if displaced.has_key(target.__name__):
        raise error, "welcome(): already resident in " + repr(target)

    namespace = target.__dict__
    saved = {}
    displaced[target.__name__] = saved
    ours = globals()

    for name in exports:
        # Keep whatever the target had under this name, then overwrite it.
        if namespace.has_key(name):
            saved[name] = namespace[name]
        namespace[name] = ours[name]

def banish(target = None):
    """Undo a previous welcome() on the module 'target'.

    'target' is interpreted as in welcome().  Each exported name goes back
    to the binding welcome() saved for it; a name with no saved binding is
    deleted from the target if it is still there.  Raises 'error' if this
    module is not currently resident in 'target'."""
    global displaced

    if not target:
        target = callermodule()
    elif type(target) == type('string'):
        target = sys.modules[target]

    if not displaced.has_key(target.__name__):
        raise error, "banish(): not currently resident in " + repr(target)

    namespace = target.__dict__
    saved = displaced[target.__name__]

    for name in exports:
        if saved.has_key(name):
            namespace[name] = saved[name]
        elif namespace.has_key(name):
            del namespace[name]

    # Forget the record so that welcome() may be used on this module again.
    del displaced[target.__name__]
